]> jfr.im git - yt-dlp.git/blob - youtube-dl
Correct comedycentral flash URL regex
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Module metadata: contributor credits, license, release version and the
# canonical raw-file URL polled by the self-update mechanism (-U).
__author__  = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
__version__ = '2011.12.18'

UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48 import ctypes
49
50 try:
51 import email.utils
52 except ImportError: # Python 2.4
53 import email.Utils
54 try:
55 import cStringIO as StringIO
56 except ImportError:
57 import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61 from urlparse import parse_qs
62 except ImportError:
63 from cgi import parse_qs
64
65 try:
66 import lxml.etree
67 except ImportError:
68 pass # Handled below
69
70 try:
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers attached to every outgoing request by
# YoutubeDLHandler.http_request(), mimicking a desktop Firefox browser so
# sites serve the same pages they would to a regular visitor.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    # Minimal pure-Python JSON decoder exposing only json.loads(), used as a
    # drop-in fallback when the stdlib json module is unavailable.  Each
    # parse* helper takes an index into the input string and returns
    # (next_index, parsed_value).
    import re
    class json(object):
        @staticmethod
        def loads(s):
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past JSON whitespace; optionally demand more input.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1 + 4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5 + 6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine high and low halves.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    # A quote preceded by an odd number of backslashes is escaped.
                    bslashes = 0
                    while s[e - bslashes - 1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e + 1, stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i + 1, res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i, key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i, val = parse(i + 1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i + 1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i + 1)
            def parseArray(i):
                res = []
                i = skipSpace(i + 1)
                if s[i] == ']': # Empty array
                    return (i + 1, res)
                while True:
                    i, val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i + 1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i + 1)
            def parseDiscrete(i):
                # true / false / null literals.
                for k, v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i + len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i + len(nums), float(nums))
                return (i + len(nums), int(nums))
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                # Dispatch on the first significant character; default: number.
                i = skipSpace(i)
                i, res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i, res)
            i, res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
195
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale encoding is unusable.
    """
    # NOTE(review): the original wrapped this in a one-shot infinite
    # generator (yield_preferredencoding().next()); that indirection added
    # nothing — a new generator was built on every call — so it is inlined.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
211
212
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
215
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
218 """
219 entity = matchobj.group(1)
220
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
224
225 # Unicode character
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 if mobj is not None:
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
230 base = 16
231 numstr = u'0%s' % numstr
232 else:
233 base = 10
234 return unichr(long(numstr, base))
235
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
244
245
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
248
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
252 function.
253
254 It returns the tuple (stream, definitive_file_name).
255 """
256 try:
257 if filename == u'-':
258 if sys.platform == 'win32':
259 import msvcrt
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
271
272
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the Unix timestamp as a number, or None when the string
    cannot be parsed as an RFC 2822 date.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
280
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
287 res = []
288 for el in iterable:
289 if el not in res:
290 res.append(el)
291 return res
292
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
301
302
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
310
311
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
319
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
323
324
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
332
333
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
348
349
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data (no zlib header); try that
        # first, then fall back to standard zlib-wrapped decompression.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl grew a 'code' constructor argument only in newer
        # Pythons; emulate it on older versions.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our std_headers onto the request (replacing any existing
        # value), then honour the Youtubedl-No-Compression marker header.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
407
408
409 class FileDownloader(object):
410 """File Downloader class.
411
412 File downloader objects are the ones responsible of downloading the
413 actual video file and writing it to disk if the user has requested
414 it, among some other tasks. In most cases there should be one per
415 program. As, given a video URL, the downloader doesn't know how to
416 extract all the needed information, task that InfoExtractors do, it
417 has to pass the URL to one of them.
418
419 For this, file downloader objects have a method that allows
420 InfoExtractors to be registered in a given order. When it is passed
421 a URL, the file downloader handles it to the first InfoExtractor it
422 finds that reports being able to handle it. The InfoExtractor extracts
423 all the information about the video or videos the URL refers to, and
424 asks the FileDownloader to process the video information, possibly
425 downloading the video.
426
427 File downloaders accept a lot of parameters. In order not to saturate
428 the object constructor with arguments, it receives a dictionary of
429 options instead. These options are available through the params
430 attribute for the InfoExtractors to use. The FileDownloader also
431 registers itself as the downloader in charge for the InfoExtractors
432 that are added to it, so this is a "mutual registration".
433
434 Available options:
435
436 username: Username for authentication purposes.
437 password: Password for authentication purposes.
438 usenetrc: Use netrc for authentication instead.
439 quiet: Do not print messages to stdout.
440 forceurl: Force printing final URL.
441 forcetitle: Force printing title.
442 forcethumbnail: Force printing thumbnail URL.
443 forcedescription: Force printing description.
444 forcefilename: Force printing final filename.
445 simulate: Do not download the video files.
446 format: Video format code.
447 format_limit: Highest quality format to try.
448 outtmpl: Template for output names.
449 ignoreerrors: Do not stop on download errors.
450 ratelimit: Download speed limit, in bytes/sec.
451 nooverwrites: Prevent overwriting files.
452 retries: Number of times to retry for HTTP error 5xx
453 continuedl: Try to continue downloads if possible.
454 noprogress: Do not print the progress bar.
455 playliststart: Playlist item to start at.
456 playlistend: Playlist item to end at.
457 matchtitle: Download only matching titles.
458 rejecttitle: Reject downloads for matching titles.
459 logtostderr: Log messages to stderr instead of stdout.
460 consoletitle: Display progress in console window's titlebar.
461 nopart: Do not use temporary .part files.
462 updatetime: Use the Last-modified header to set output file timestamps.
463 writedescription: Write the video description to a .description file
464 writeinfojson: Write the video description to a .info.json file
465 """
466
467 params = None
468 _ies = []
469 _pps = []
470 _download_retcode = None
471 _num_downloads = None
472 _screen_file = None
473
474 def __init__(self, params):
475 """Create a FileDownloader object with the given options."""
476 self._ies = []
477 self._pps = []
478 self._download_retcode = 0
479 self._num_downloads = 0
480 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
481 self.params = params
482
483 @staticmethod
484 def format_bytes(bytes):
485 if bytes is None:
486 return 'N/A'
487 if type(bytes) is str:
488 bytes = float(bytes)
489 if bytes == 0.0:
490 exponent = 0
491 else:
492 exponent = long(math.log(bytes, 1024.0))
493 suffix = 'bkMGTPEZY'[exponent]
494 converted = float(bytes) / float(1024 ** exponent)
495 return '%.2f%s' % (converted, suffix)
496
497 @staticmethod
498 def calc_percent(byte_counter, data_len):
499 if data_len is None:
500 return '---.-%'
501 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
502
503 @staticmethod
504 def calc_eta(start, now, total, current):
505 if total is None:
506 return '--:--'
507 dif = now - start
508 if current == 0 or dif < 0.001: # One millisecond
509 return '--:--'
510 rate = float(current) / dif
511 eta = long((float(total) - float(current)) / rate)
512 (eta_mins, eta_secs) = divmod(eta, 60)
513 if eta_mins > 99:
514 return '--:--'
515 return '%02d:%02d' % (eta_mins, eta_secs)
516
517 @staticmethod
518 def calc_speed(start, now, bytes):
519 dif = now - start
520 if bytes == 0 or dif < 0.001: # One millisecond
521 return '%10s' % '---b/s'
522 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
523
524 @staticmethod
525 def best_block_size(elapsed_time, bytes):
526 new_min = max(bytes / 2.0, 1.0)
527 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528 if elapsed_time < 0.001:
529 return long(new_max)
530 rate = bytes / elapsed_time
531 if rate > new_max:
532 return long(new_max)
533 if rate < new_min:
534 return long(new_min)
535 return long(rate)
536
537 @staticmethod
538 def parse_bytes(bytestr):
539 """Parse a string indicating a byte quantity into a long integer."""
540 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
541 if matchobj is None:
542 return None
543 number = float(matchobj.group(1))
544 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545 return long(round(number * multiplier))
546
547 def add_info_extractor(self, ie):
548 """Add an InfoExtractor object to the end of the list."""
549 self._ies.append(ie)
550 ie.set_downloader(self)
551
552 def add_post_processor(self, pp):
553 """Add a PostProcessor object to the end of the chain."""
554 self._pps.append(pp)
555 pp.set_downloader(self)
556
557 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
558 """Print message to stdout if not in quiet mode."""
559 try:
560 if not self.params.get('quiet', False):
561 terminator = [u'\n', u''][skip_eol]
562 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
563 self._screen_file.flush()
564 except (UnicodeEncodeError), err:
565 if not ignore_encoding_errors:
566 raise
567
568 def to_stderr(self, message):
569 """Print message to stderr."""
570 print >>sys.stderr, message.encode(preferredencoding())
571
572 def to_cons_title(self, message):
573 """Set console/terminal window title to message."""
574 if not self.params.get('consoletitle', False):
575 return
576 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
577 # c_wchar_p() might not be necessary if `message` is
578 # already of type unicode()
579 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
580 elif 'TERM' in os.environ:
581 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
582
583 def fixed_template(self):
584 """Checks if the output template is fixed."""
585 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
586
587 def trouble(self, message=None):
588 """Determine action to take when a download problem appears.
589
590 Depending on if the downloader has been configured to ignore
591 download errors or not, this method may throw an exception or
592 not when errors are found, after printing the message.
593 """
594 if message is not None:
595 self.to_stderr(message)
596 if not self.params.get('ignoreerrors', False):
597 raise DownloadError(message)
598 self._download_retcode = 1
599
600 def slow_down(self, start_time, byte_counter):
601 """Sleep if the download speed is over the rate limit."""
602 rate_limit = self.params.get('ratelimit', None)
603 if rate_limit is None or byte_counter == 0:
604 return
605 now = time.time()
606 elapsed = now - start_time
607 if elapsed <= 0.0:
608 return
609 speed = float(byte_counter) / elapsed
610 if speed > rate_limit:
611 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
612
613 def temp_name(self, filename):
614 """Returns a temporary filename for the given filename."""
615 if self.params.get('nopart', False) or filename == u'-' or \
616 (os.path.exists(filename) and not os.path.isfile(filename)):
617 return filename
618 return filename + u'.part'
619
620 def undo_temp_name(self, filename):
621 if filename.endswith(u'.part'):
622 return filename[:-len(u'.part')]
623 return filename
624
625 def try_rename(self, old_filename, new_filename):
626 try:
627 if old_filename == new_filename:
628 return
629 os.rename(old_filename, new_filename)
630 except (IOError, OSError), err:
631 self.trouble(u'ERROR: unable to rename file')
632
633 def try_utime(self, filename, last_modified_hdr):
634 """Try to set the last-modified time of the given file."""
635 if last_modified_hdr is None:
636 return
637 if not os.path.isfile(filename):
638 return
639 timestr = last_modified_hdr
640 if timestr is None:
641 return
642 filetime = timeconvert(timestr)
643 if filetime is None:
644 return filetime
645 try:
646 os.utime(filename, (time.time(), filetime))
647 except:
648 pass
649 return filetime
650
651 def report_writedescription(self, descfn):
652 """ Report that the description file is being written """
653 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
654
655 def report_writeinfojson(self, infofn):
656 """ Report that the metadata file has been written """
657 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
658
659 def report_destination(self, filename):
660 """Report destination filename."""
661 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
662
663 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
664 """Report download progress."""
665 if self.params.get('noprogress', False):
666 return
667 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
668 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
669 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
670 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
671
672 def report_resuming_byte(self, resume_len):
673 """Report attempt to resume at given byte."""
674 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
675
676 def report_retry(self, count, retries):
677 """Report retry in case of HTTP error 5xx"""
678 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
679
680 def report_file_already_downloaded(self, file_name):
681 """Report file has already been fully downloaded."""
682 try:
683 self.to_screen(u'[download] %s has already been downloaded' % file_name)
684 except (UnicodeEncodeError), err:
685 self.to_screen(u'[download] The file has already been downloaded')
686
687 def report_unable_to_resume(self):
688 """Report it was impossible to resume download."""
689 self.to_screen(u'[download] Unable to resume')
690
691 def report_finish(self):
692 """Report download finished."""
693 if self.params.get('noprogress', False):
694 self.to_screen(u'[download] Download completed')
695 else:
696 self.to_screen(u'')
697
698 def increment_downloads(self):
699 """Increment the ordinal that assigns a number to each file."""
700 self._num_downloads += 1
701
702 def prepare_filename(self, info_dict):
703 """Generate the output filename."""
704 try:
705 template_dict = dict(info_dict)
706 template_dict['epoch'] = unicode(long(time.time()))
707 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708 filename = self.params['outtmpl'] % template_dict
709 return filename
710 except (ValueError, KeyError), err:
711 self.trouble(u'ERROR: invalid system charset or erroneous output template')
712 return None
713
714 def _match_entry(self, info_dict):
715 """ Returns None iff the file should be downloaded """
716
717 title = info_dict['title']
718 matchtitle = self.params.get('matchtitle', False)
719 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721 rejecttitle = self.params.get('rejecttitle', False)
722 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724 return None
725
726 def process_info(self, info_dict):
727 """Process a single dictionary returned by an InfoExtractor."""
728
729 reason = self._match_entry(info_dict)
730 if reason is not None:
731 self.to_screen(u'[download] ' + reason)
732 return
733
734 max_downloads = self.params.get('max_downloads')
735 if max_downloads is not None:
736 if self._num_downloads > int(max_downloads):
737 raise MaxDownloadsReached()
738
739 filename = self.prepare_filename(info_dict)
740
741 # Forced printings
742 if self.params.get('forcetitle', False):
743 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
744 if self.params.get('forceurl', False):
745 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
746 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
747 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
748 if self.params.get('forcedescription', False) and 'description' in info_dict:
749 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
750 if self.params.get('forcefilename', False) and filename is not None:
751 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
752 if self.params.get('forceformat', False):
753 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
754
755 # Do nothing else if in simulate mode
756 if self.params.get('simulate', False):
757 return
758
759 if filename is None:
760 return
761
762 try:
763 dn = os.path.dirname(filename)
764 if dn != '' and not os.path.exists(dn):
765 os.makedirs(dn)
766 except (OSError, IOError), err:
767 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
768 return
769
770 if self.params.get('writedescription', False):
771 try:
772 descfn = filename + '.description'
773 self.report_writedescription(descfn)
774 descfile = open(descfn, 'wb')
775 try:
776 descfile.write(info_dict['description'].encode('utf-8'))
777 finally:
778 descfile.close()
779 except (OSError, IOError):
780 self.trouble(u'ERROR: Cannot write description file ' + descfn)
781 return
782
783 if self.params.get('writeinfojson', False):
784 infofn = filename + '.info.json'
785 self.report_writeinfojson(infofn)
786 try:
787 json.dump
788 except (NameError,AttributeError):
789 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
790 return
791 try:
792 infof = open(infofn, 'wb')
793 try:
794 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
795 json.dump(json_info_dict, infof)
796 finally:
797 infof.close()
798 except (OSError, IOError):
799 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
800 return
801
802 if not self.params.get('skip_download', False):
803 if self.params.get('nooverwrites', False) and os.path.exists(filename):
804 success = True
805 else:
806 try:
807 success = self._do_download(filename, info_dict)
808 except (OSError, IOError), err:
809 raise UnavailableVideoError
810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
811 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
812 return
813 except (ContentTooShortError, ), err:
814 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
815 return
816
817 if success:
818 try:
819 self.post_process(filename, info_dict)
820 except (PostProcessingError), err:
821 self.trouble(u'ERROR: postprocessing: %s' % str(err))
822 return
823
824 def download(self, url_list):
825 """Download a given list of URLs."""
826 if len(url_list) > 1 and self.fixed_template():
827 raise SameFileError(self.params['outtmpl'])
828
829 for url in url_list:
830 suitable_found = False
831 for ie in self._ies:
832 # Go to next InfoExtractor if not suitable
833 if not ie.suitable(url):
834 continue
835
836 # Suitable InfoExtractor found
837 suitable_found = True
838
839 # Extract information from URL and process it
840 ie.extract(url)
841
842 # Suitable InfoExtractor had been found; go to next URL
843 break
844
845 if not suitable_found:
846 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
847
848 return self._download_retcode
849
850 def post_process(self, filename, ie_info):
851 """Run the postprocessing chain on the given file."""
852 info = dict(ie_info)
853 info['filepath'] = filename
854 for pp in self._pps:
855 info = pp.run(info)
856 if info is None:
857 break
858
859 def _download_with_rtmpdump(self, filename, url, player_url):
860 self.report_destination(filename)
861 tmpfilename = self.temp_name(filename)
862
863 # Check for rtmpdump first
864 try:
865 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
866 except (OSError, IOError):
867 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
868 return False
869
870 # Download using rtmpdump. rtmpdump returns exit code 2 when
871 # the connection was interrumpted and resuming appears to be
872 # possible. This is part of rtmpdump's normal usage, AFAIK.
873 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
874 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
875 while retval == 2 or retval == 1:
876 prevsize = os.path.getsize(tmpfilename)
877 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
878 time.sleep(5.0) # This seems to be needed
879 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
880 cursize = os.path.getsize(tmpfilename)
881 if prevsize == cursize and retval == 1:
882 break
883 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
884 if prevsize == cursize and retval == 2 and cursize > 1024:
885 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
886 retval = 0
887 break
888 if retval == 0:
889 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
890 self.try_rename(tmpfilename, filename)
891 return True
892 else:
893 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
894 return False
895
896 def _do_download(self, filename, info_dict):
897 url = info_dict['url']
898 player_url = info_dict.get('player_url', None)
899
900 # Check file already present
901 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
902 self.report_file_already_downloaded(filename)
903 return True
904
905 # Attempt to download using rtmpdump
906 if url.startswith('rtmp'):
907 return self._download_with_rtmpdump(filename, url, player_url)
908
909 tmpfilename = self.temp_name(filename)
910 stream = None
911
912 # Do not include the Accept-Encoding header
913 headers = {'Youtubedl-no-compression': 'True'}
914 basic_request = urllib2.Request(url, None, headers)
915 request = urllib2.Request(url, None, headers)
916
917 # Establish possible resume length
918 if os.path.isfile(tmpfilename):
919 resume_len = os.path.getsize(tmpfilename)
920 else:
921 resume_len = 0
922
923 open_mode = 'wb'
924 if resume_len != 0:
925 if self.params.get('continuedl', False):
926 self.report_resuming_byte(resume_len)
927 request.add_header('Range','bytes=%d-' % resume_len)
928 open_mode = 'ab'
929 else:
930 resume_len = 0
931
932 count = 0
933 retries = self.params.get('retries', 0)
934 while count <= retries:
935 # Establish connection
936 try:
937 if count == 0 and 'urlhandle' in info_dict:
938 data = info_dict['urlhandle']
939 data = urllib2.urlopen(request)
940 break
941 except (urllib2.HTTPError, ), err:
942 if (err.code < 500 or err.code >= 600) and err.code != 416:
943 # Unexpected HTTP error
944 raise
945 elif err.code == 416:
946 # Unable to resume (requested range not satisfiable)
947 try:
948 # Open the connection again without the range header
949 data = urllib2.urlopen(basic_request)
950 content_length = data.info()['Content-Length']
951 except (urllib2.HTTPError, ), err:
952 if err.code < 500 or err.code >= 600:
953 raise
954 else:
955 # Examine the reported length
956 if (content_length is not None and
957 (resume_len - 100 < long(content_length) < resume_len + 100)):
958 # The file had already been fully downloaded.
959 # Explanation to the above condition: in issue #175 it was revealed that
960 # YouTube sometimes adds or removes a few bytes from the end of the file,
961 # changing the file size slightly and causing problems for some users. So
962 # I decided to implement a suggested change and consider the file
963 # completely downloaded if the file size differs less than 100 bytes from
964 # the one in the hard drive.
965 self.report_file_already_downloaded(filename)
966 self.try_rename(tmpfilename, filename)
967 return True
968 else:
969 # The length does not match, we start the download over
970 self.report_unable_to_resume()
971 open_mode = 'wb'
972 break
973 # Retry
974 count += 1
975 if count <= retries:
976 self.report_retry(count, retries)
977
978 if count > retries:
979 self.trouble(u'ERROR: giving up after %s retries' % retries)
980 return False
981
982 data_len = data.info().get('Content-length', None)
983 if data_len is not None:
984 data_len = long(data_len) + resume_len
985 data_len_str = self.format_bytes(data_len)
986 byte_counter = 0 + resume_len
987 block_size = 1024
988 start = time.time()
989 while True:
990 # Download and write
991 before = time.time()
992 data_block = data.read(block_size)
993 after = time.time()
994 if len(data_block) == 0:
995 break
996 byte_counter += len(data_block)
997
998 # Open file just in time
999 if stream is None:
1000 try:
1001 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1002 assert stream is not None
1003 filename = self.undo_temp_name(tmpfilename)
1004 self.report_destination(filename)
1005 except (OSError, IOError), err:
1006 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1007 return False
1008 try:
1009 stream.write(data_block)
1010 except (IOError, OSError), err:
1011 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1012 return False
1013 block_size = self.best_block_size(after - before, len(data_block))
1014
1015 # Progress message
1016 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1017 if data_len is None:
1018 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1019 else:
1020 percent_str = self.calc_percent(byte_counter, data_len)
1021 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1022 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1023
1024 # Apply rate limit
1025 self.slow_down(start, byte_counter - resume_len)
1026
1027 if stream is None:
1028 self.trouble(u'\nERROR: Did not get any data blocks')
1029 return False
1030 stream.close()
1031 self.report_finish()
1032 if data_len is not None and byte_counter != data_len:
1033 raise ContentTooShortError(byte_counter, long(data_len))
1034 self.try_rename(tmpfilename, filename)
1035
1036 # Update file modification time
1037 if self.params.get('updatetime', True):
1038 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1039
1040 return True
1041
1042
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # Lazy-init flag: set once _real_initialize() has run.
    _ready = False
    # The FileDownloader this extractor reports to (set via set_downloader).
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
1112
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 captures the URL prefix up to the video id, group 2 the id
    # itself; the conditional (?(1).+)? requires trailing characters only
    # when a prefix was matched (i.e. a full URL rather than a bare id).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/file extension (formats absent here default to flv)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string; '???' means unknown
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set interface language and, if credentials are available,
        log in and confirm age. Credentials come from --username/--password
        or, with --netrc, from the 'youtube' machine entry in ~/.netrc.
        All failures here are soft (warnings), except age confirmation."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and stream URL(s) for one YouTube video and
        hand each selected format to the downloader via process_info().
        Errors are reported through self._downloader.trouble()."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL (needed by rtmpdump for
        # SWF verification); the page embeds it with escaped slashes.
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants, since different
        # video classes (embedded, vevo, ...) answer to different ones.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to
        # YYYYMMDD by trying each known display format in turn.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Try the next format; on total failure the raw
                    # scraped string is kept as-is.
                    pass

        # description
        try:
            # Probe whether the optional lxml import at the top of the
            # file succeeded; NameError means it is unavailable.
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id':       video_id.decode('utf-8'),
                    'url':      video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date':  upload_date,
                    'title':    video_title,
                    'stitle':   simple_title,
                    'ext':      video_extension.decode('utf-8'),
                    'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail':    video_thumbnail.decode('utf-8'),
                    'description':  video_description,
                    'player_url':   player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1431
1432
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the numeric video id, group 2 the simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used to delegate 'yt-' prefixed Metacafe ids.
    _youtube_ie = None
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. Receives the YoutubeIE to delegate YouTube-hosted
        videos to, and an optional downloader."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        that age-restricted videos become accessible via cookies."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Metacafe watch
        page and hand the result to the downloader. Errors are reported
        through self._downloader.trouble()."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage. Two page layouts
        # exist: a plain &mediaURL= parameter, or a flashvars blob whose
        # mediaData JSON holds the URL and access key.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available (access key appended to the URL)
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  u'NA',
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1573
1574
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 is the video id (part before the first underscore),
    # group 2 the title slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Dailymotion page
        and hand the result to the downloader. Errors are reported
        through self._downloader.trouble()."""
        htmlParser = HTMLParser.HTMLParser()

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information;
        # the cookie disables the family filter for restricted videos.
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage: the player is
        # fed a URL-encoded "sequence" JSON blob containing sdURL.
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = htmlParser.unescape(mobj.group('title')).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  u'NA',
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1663
1664
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Group 1 is the docid query parameter (the video id).
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and (optionally)
        thumbnail from a Google Video page and hand the result to the
        downloader. Errors are reported through self._downloader.trouble()."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage. Prefer the mp4
        # download_url; fall back to the hex-escaped flv videoUrl.
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the JavaScript hex escapes (\x3d is '=', \x26 is '&')
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail: only fetched on demand because it
        # requires an extra search-page request.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        try:
            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date':  u'NA',
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1767
1768
1769 class PhotobucketIE(InfoExtractor):
1770 """Information extractor for photobucket.com."""
1771
1772 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1773 IE_NAME = u'photobucket'
1774
1775 def __init__(self, downloader=None):
1776 InfoExtractor.__init__(self, downloader)
1777
1778 def report_download_webpage(self, video_id):
1779 """Report webpage download."""
1780 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1781
1782 def report_extraction(self, video_id):
1783 """Report information extraction."""
1784 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1785
1786 def _real_extract(self, url):
1787 # Extract id from URL
1788 mobj = re.match(self._VALID_URL, url)
1789 if mobj is None:
1790 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1791 return
1792
1793 # At this point we have a new video
1794 self._downloader.increment_downloads()
1795 video_id = mobj.group(1)
1796
1797 video_extension = 'flv'
1798
1799 # Retrieve video webpage to extract further information
1800 request = urllib2.Request(url)
1801 try:
1802 self.report_download_webpage(video_id)
1803 webpage = urllib2.urlopen(request).read()
1804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1806 return
1807
1808 # Extract URL, uploader, and title from webpage
1809 self.report_extraction(video_id)
1810 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1811 if mobj is None:
1812 self._downloader.trouble(u'ERROR: unable to extract media URL')
1813 return
1814 mediaURL = urllib.unquote(mobj.group(1))
1815
1816 video_url = mediaURL
1817
1818 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1819 if mobj is None:
1820 self._downloader.trouble(u'ERROR: unable to extract title')
1821 return
1822 video_title = mobj.group(1).decode('utf-8')
1823 video_title = sanitize_title(video_title)
1824 simple_title = _simplify_title(vide_title)
1825
1826 video_uploader = mobj.group(2).decode('utf-8')
1827
1828 try:
1829 # Process video information
1830 self._downloader.process_info({
1831 'id': video_id.decode('utf-8'),
1832 'url': video_url.decode('utf-8'),
1833 'uploader': video_uploader,
1834 'upload_date': u'NA',
1835 'title': video_title,
1836 'stitle': simple_title,
1837 'ext': video_extension.decode('utf-8'),
1838 'format': u'NA',
1839 'player_url': None,
1840 })
1841 except UnavailableVideoError:
1842 self._downloader.trouble(u'\nERROR: unable to download video')
1843
1844
1845 class YahooIE(InfoExtractor):
1846 """Information extractor for video.yahoo.com."""
1847
1848 # _VALID_URL matches all Yahoo! Video URLs
1849 # _VPAGE_URL matches only the extractable '/watch/' URLs
1850 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1851 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1852 IE_NAME = u'video.yahoo'
1853
1854 def __init__(self, downloader=None):
1855 InfoExtractor.__init__(self, downloader)
1856
1857 def report_download_webpage(self, video_id):
1858 """Report webpage download."""
1859 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1860
1861 def report_extraction(self, video_id):
1862 """Report information extraction."""
1863 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1864
1865 def _real_extract(self, url, new_video=True):
1866 # Extract ID from URL
1867 mobj = re.match(self._VALID_URL, url)
1868 if mobj is None:
1869 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1870 return
1871
1872 # At this point we have a new video
1873 self._downloader.increment_downloads()
1874 video_id = mobj.group(2)
1875 video_extension = 'flv'
1876
1877 # Rewrite valid but non-extractable URLs as
1878 # extractable English language /watch/ URLs
1879 if re.match(self._VPAGE_URL, url) is None:
1880 request = urllib2.Request(url)
1881 try:
1882 webpage = urllib2.urlopen(request).read()
1883 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1884 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1885 return
1886
1887 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1888 if mobj is None:
1889 self._downloader.trouble(u'ERROR: Unable to extract id field')
1890 return
1891 yahoo_id = mobj.group(1)
1892
1893 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1894 if mobj is None:
1895 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1896 return
1897 yahoo_vid = mobj.group(1)
1898
1899 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1900 return self._real_extract(url, new_video=False)
1901
1902 # Retrieve video webpage to extract further information
1903 request = urllib2.Request(url)
1904 try:
1905 self.report_download_webpage(video_id)
1906 webpage = urllib2.urlopen(request).read()
1907 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1908 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1909 return
1910
1911 # Extract uploader and title from webpage
1912 self.report_extraction(video_id)
1913 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1914 if mobj is None:
1915 self._downloader.trouble(u'ERROR: unable to extract video title')
1916 return
1917 video_title = mobj.group(1).decode('utf-8')
1918 simple_title = _simplify_title(video_title)
1919
1920 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1921 if mobj is None:
1922 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1923 return
1924 video_uploader = mobj.group(1).decode('utf-8')
1925
1926 # Extract video thumbnail
1927 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1928 if mobj is None:
1929 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1930 return
1931 video_thumbnail = mobj.group(1).decode('utf-8')
1932
1933 # Extract video description
1934 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1935 if mobj is None:
1936 self._downloader.trouble(u'ERROR: unable to extract video description')
1937 return
1938 video_description = mobj.group(1).decode('utf-8')
1939 if not video_description:
1940 video_description = 'No description available.'
1941
1942 # Extract video height and width
1943 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1944 if mobj is None:
1945 self._downloader.trouble(u'ERROR: unable to extract video height')
1946 return
1947 yv_video_height = mobj.group(1)
1948
1949 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1950 if mobj is None:
1951 self._downloader.trouble(u'ERROR: unable to extract video width')
1952 return
1953 yv_video_width = mobj.group(1)
1954
1955 # Retrieve video playlist to extract media URL
1956 # I'm not completely sure what all these options are, but we
1957 # seem to need most of them, otherwise the server sends a 401.
1958 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1959 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1960 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1961 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1962 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1963 try:
1964 self.report_download_webpage(video_id)
1965 webpage = urllib2.urlopen(request).read()
1966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1967 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1968 return
1969
1970 # Extract media URL from playlist XML
1971 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1972 if mobj is None:
1973 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1974 return
1975 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1976 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1977
1978 try:
1979 # Process video information
1980 self._downloader.process_info({
1981 'id': video_id.decode('utf-8'),
1982 'url': video_url,
1983 'uploader': video_uploader,
1984 'upload_date': u'NA',
1985 'title': video_title,
1986 'stitle': simple_title,
1987 'ext': video_extension.decode('utf-8'),
1988 'thumbnail': video_thumbnail.decode('utf-8'),
1989 'description': video_description,
1990 'thumbnail': video_thumbnail,
1991 'player_url': None,
1992 })
1993 except UnavailableVideoError:
1994 self._downloader.trouble(u'\nERROR: unable to download video')
1995
1996
1997 class VimeoIE(InfoExtractor):
1998 """Information extractor for vimeo.com."""
1999
2000 # _VALID_URL matches Vimeo URLs
2001 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2002 IE_NAME = u'vimeo'
2003
2004 def __init__(self, downloader=None):
2005 InfoExtractor.__init__(self, downloader)
2006
2007 def report_download_webpage(self, video_id):
2008 """Report webpage download."""
2009 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2010
2011 def report_extraction(self, video_id):
2012 """Report information extraction."""
2013 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2014
2015 def _real_extract(self, url, new_video=True):
2016 # Extract ID from URL
2017 mobj = re.match(self._VALID_URL, url)
2018 if mobj is None:
2019 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2020 return
2021
2022 # At this point we have a new video
2023 self._downloader.increment_downloads()
2024 video_id = mobj.group(1)
2025
2026 # Retrieve video webpage to extract further information
2027 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2028 try:
2029 self.report_download_webpage(video_id)
2030 webpage = urllib2.urlopen(request).read()
2031 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2032 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2033 return
2034
2035 # Now we begin extracting as much information as we can from what we
2036 # retrieved. First we extract the information common to all extractors,
2037 # and latter we extract those that are Vimeo specific.
2038 self.report_extraction(video_id)
2039
2040 # Extract title
2041 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2042 if mobj is None:
2043 self._downloader.trouble(u'ERROR: unable to extract video title')
2044 return
2045 video_title = mobj.group(1).decode('utf-8')
2046 simple_title = _simplify_title(video_title)
2047
2048 # Extract uploader
2049 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2050 if mobj is None:
2051 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2052 return
2053 video_uploader = mobj.group(1).decode('utf-8')
2054
2055 # Extract video thumbnail
2056 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2057 if mobj is None:
2058 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2059 return
2060 video_thumbnail = mobj.group(1).decode('utf-8')
2061
2062 # # Extract video description
2063 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2064 # if mobj is None:
2065 # self._downloader.trouble(u'ERROR: unable to extract video description')
2066 # return
2067 # video_description = mobj.group(1).decode('utf-8')
2068 # if not video_description: video_description = 'No description available.'
2069 video_description = 'Foo.'
2070
2071 # Vimeo specific: extract request signature
2072 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2073 if mobj is None:
2074 self._downloader.trouble(u'ERROR: unable to extract request signature')
2075 return
2076 sig = mobj.group(1).decode('utf-8')
2077
2078 # Vimeo specific: extract video quality information
2079 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2080 if mobj is None:
2081 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2082 return
2083 quality = mobj.group(1).decode('utf-8')
2084
2085 if int(quality) == 1:
2086 quality = 'hd'
2087 else:
2088 quality = 'sd'
2089
2090 # Vimeo specific: Extract request signature expiration
2091 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2092 if mobj is None:
2093 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2094 return
2095 sig_exp = mobj.group(1).decode('utf-8')
2096
2097 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2098
2099 try:
2100 # Process video information
2101 self._downloader.process_info({
2102 'id': video_id.decode('utf-8'),
2103 'url': video_url,
2104 'uploader': video_uploader,
2105 'upload_date': u'NA',
2106 'title': video_title,
2107 'stitle': simple_title,
2108 'ext': u'mp4',
2109 'thumbnail': video_thumbnail.decode('utf-8'),
2110 'description': video_description,
2111 'thumbnail': video_thumbnail,
2112 'description': video_description,
2113 'player_url': None,
2114 })
2115 except UnavailableVideoError:
2116 self._downloader.trouble(u'ERROR: unable to download video')
2117
2118
2119 class GenericIE(InfoExtractor):
2120 """Generic last-resort information extractor."""
2121
2122 _VALID_URL = r'.*'
2123 IE_NAME = u'generic'
2124
2125 def __init__(self, downloader=None):
2126 InfoExtractor.__init__(self, downloader)
2127
2128 def report_download_webpage(self, video_id):
2129 """Report webpage download."""
2130 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2131 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2132
2133 def report_extraction(self, video_id):
2134 """Report information extraction."""
2135 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2136
2137 def _real_extract(self, url):
2138 # At this point we have a new video
2139 self._downloader.increment_downloads()
2140
2141 video_id = url.split('/')[-1]
2142 request = urllib2.Request(url)
2143 try:
2144 self.report_download_webpage(video_id)
2145 webpage = urllib2.urlopen(request).read()
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2148 return
2149 except ValueError, err:
2150 # since this is the last-resort InfoExtractor, if
2151 # this error is thrown, it'll be thrown here
2152 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2153 return
2154
2155 self.report_extraction(video_id)
2156 # Start with something easy: JW Player in SWFObject
2157 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2158 if mobj is None:
2159 # Broaden the search a little bit
2160 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2161 if mobj is None:
2162 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2163 return
2164
2165 # It's possible that one of the regexes
2166 # matched, but returned an empty group:
2167 if mobj.group(1) is None:
2168 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2169 return
2170
2171 video_url = urllib.unquote(mobj.group(1))
2172 video_id = os.path.basename(video_url)
2173
2174 # here's a fun little line of code for you:
2175 video_extension = os.path.splitext(video_id)[1][1:]
2176 video_id = os.path.splitext(video_id)[0]
2177
2178 # it's tempting to parse this further, but you would
2179 # have to take into account all the variations like
2180 # Video Title - Site Name
2181 # Site Name | Video Title
2182 # Video Title - Tagline | Site Name
2183 # and so on and so forth; it's just not practical
2184 mobj = re.search(r'<title>(.*)</title>', webpage)
2185 if mobj is None:
2186 self._downloader.trouble(u'ERROR: unable to extract title')
2187 return
2188 video_title = mobj.group(1).decode('utf-8')
2189 video_title = sanitize_title(video_title)
2190 simple_title = _simplify_title(video_title)
2191
2192 # video uploader is domain name
2193 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2194 if mobj is None:
2195 self._downloader.trouble(u'ERROR: unable to extract title')
2196 return
2197 video_uploader = mobj.group(1).decode('utf-8')
2198
2199 try:
2200 # Process video information
2201 self._downloader.process_info({
2202 'id': video_id.decode('utf-8'),
2203 'url': video_url.decode('utf-8'),
2204 'uploader': video_uploader,
2205 'upload_date': u'NA',
2206 'title': video_title,
2207 'stitle': simple_title,
2208 'ext': video_extension.decode('utf-8'),
2209 'format': u'NA',
2210 'player_url': None,
2211 })
2212 except UnavailableVideoError, err:
2213 self._downloader.trouble(u'\nERROR: unable to download video')
2214
2215
2216 class YoutubeSearchIE(InfoExtractor):
2217 """Information Extractor for YouTube search queries."""
2218 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2219 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2220 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2221 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2222 _youtube_ie = None
2223 _max_youtube_results = 1000
2224 IE_NAME = u'youtube:search'
2225
2226 def __init__(self, youtube_ie, downloader=None):
2227 InfoExtractor.__init__(self, downloader)
2228 self._youtube_ie = youtube_ie
2229
2230 def report_download_page(self, query, pagenum):
2231 """Report attempt to download playlist page with given number."""
2232 query = query.decode(preferredencoding())
2233 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2234
2235 def _real_initialize(self):
2236 self._youtube_ie.initialize()
2237
2238 def _real_extract(self, query):
2239 mobj = re.match(self._VALID_URL, query)
2240 if mobj is None:
2241 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2242 return
2243
2244 prefix, query = query.split(':')
2245 prefix = prefix[8:]
2246 query = query.encode('utf-8')
2247 if prefix == '':
2248 self._download_n_results(query, 1)
2249 return
2250 elif prefix == 'all':
2251 self._download_n_results(query, self._max_youtube_results)
2252 return
2253 else:
2254 try:
2255 n = long(prefix)
2256 if n <= 0:
2257 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2258 return
2259 elif n > self._max_youtube_results:
2260 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2261 n = self._max_youtube_results
2262 self._download_n_results(query, n)
2263 return
2264 except ValueError: # parsing prefix as integer fails
2265 self._download_n_results(query, 1)
2266 return
2267
2268 def _download_n_results(self, query, n):
2269 """Downloads a specified number of results for a query"""
2270
2271 video_ids = []
2272 already_seen = set()
2273 pagenum = 1
2274
2275 while True:
2276 self.report_download_page(query, pagenum)
2277 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2278 request = urllib2.Request(result_url)
2279 try:
2280 page = urllib2.urlopen(request).read()
2281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2282 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2283 return
2284
2285 # Extract video identifiers
2286 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2287 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2288 if video_id not in already_seen:
2289 video_ids.append(video_id)
2290 already_seen.add(video_id)
2291 if len(video_ids) == n:
2292 # Specified n videos reached
2293 for id in video_ids:
2294 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2295 return
2296
2297 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2298 for id in video_ids:
2299 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2300 return
2301
2302 pagenum = pagenum + 1
2303
2304
2305 class GoogleSearchIE(InfoExtractor):
2306 """Information Extractor for Google Video search queries."""
2307 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2308 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2309 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2310 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2311 _google_ie = None
2312 _max_google_results = 1000
2313 IE_NAME = u'video.google:search'
2314
2315 def __init__(self, google_ie, downloader=None):
2316 InfoExtractor.__init__(self, downloader)
2317 self._google_ie = google_ie
2318
2319 def report_download_page(self, query, pagenum):
2320 """Report attempt to download playlist page with given number."""
2321 query = query.decode(preferredencoding())
2322 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2323
2324 def _real_initialize(self):
2325 self._google_ie.initialize()
2326
2327 def _real_extract(self, query):
2328 mobj = re.match(self._VALID_URL, query)
2329 if mobj is None:
2330 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2331 return
2332
2333 prefix, query = query.split(':')
2334 prefix = prefix[8:]
2335 query = query.encode('utf-8')
2336 if prefix == '':
2337 self._download_n_results(query, 1)
2338 return
2339 elif prefix == 'all':
2340 self._download_n_results(query, self._max_google_results)
2341 return
2342 else:
2343 try:
2344 n = long(prefix)
2345 if n <= 0:
2346 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2347 return
2348 elif n > self._max_google_results:
2349 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2350 n = self._max_google_results
2351 self._download_n_results(query, n)
2352 return
2353 except ValueError: # parsing prefix as integer fails
2354 self._download_n_results(query, 1)
2355 return
2356
2357 def _download_n_results(self, query, n):
2358 """Downloads a specified number of results for a query"""
2359
2360 video_ids = []
2361 already_seen = set()
2362 pagenum = 1
2363
2364 while True:
2365 self.report_download_page(query, pagenum)
2366 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2367 request = urllib2.Request(result_url)
2368 try:
2369 page = urllib2.urlopen(request).read()
2370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2372 return
2373
2374 # Extract video identifiers
2375 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2376 video_id = mobj.group(1)
2377 if video_id not in already_seen:
2378 video_ids.append(video_id)
2379 already_seen.add(video_id)
2380 if len(video_ids) == n:
2381 # Specified n videos reached
2382 for id in video_ids:
2383 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2384 return
2385
2386 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2387 for id in video_ids:
2388 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2389 return
2390
2391 pagenum = pagenum + 1
2392
2393
2394 class YahooSearchIE(InfoExtractor):
2395 """Information Extractor for Yahoo! Video search queries."""
2396 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2397 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2398 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2399 _MORE_PAGES_INDICATOR = r'\s*Next'
2400 _yahoo_ie = None
2401 _max_yahoo_results = 1000
2402 IE_NAME = u'video.yahoo:search'
2403
2404 def __init__(self, yahoo_ie, downloader=None):
2405 InfoExtractor.__init__(self, downloader)
2406 self._yahoo_ie = yahoo_ie
2407
2408 def report_download_page(self, query, pagenum):
2409 """Report attempt to download playlist page with given number."""
2410 query = query.decode(preferredencoding())
2411 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2412
2413 def _real_initialize(self):
2414 self._yahoo_ie.initialize()
2415
2416 def _real_extract(self, query):
2417 mobj = re.match(self._VALID_URL, query)
2418 if mobj is None:
2419 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2420 return
2421
2422 prefix, query = query.split(':')
2423 prefix = prefix[8:]
2424 query = query.encode('utf-8')
2425 if prefix == '':
2426 self._download_n_results(query, 1)
2427 return
2428 elif prefix == 'all':
2429 self._download_n_results(query, self._max_yahoo_results)
2430 return
2431 else:
2432 try:
2433 n = long(prefix)
2434 if n <= 0:
2435 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2436 return
2437 elif n > self._max_yahoo_results:
2438 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2439 n = self._max_yahoo_results
2440 self._download_n_results(query, n)
2441 return
2442 except ValueError: # parsing prefix as integer fails
2443 self._download_n_results(query, 1)
2444 return
2445
2446 def _download_n_results(self, query, n):
2447 """Downloads a specified number of results for a query"""
2448
2449 video_ids = []
2450 already_seen = set()
2451 pagenum = 1
2452
2453 while True:
2454 self.report_download_page(query, pagenum)
2455 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2456 request = urllib2.Request(result_url)
2457 try:
2458 page = urllib2.urlopen(request).read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2461 return
2462
2463 # Extract video identifiers
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 video_id = mobj.group(1)
2466 if video_id not in already_seen:
2467 video_ids.append(video_id)
2468 already_seen.add(video_id)
2469 if len(video_ids) == n:
2470 # Specified n videos reached
2471 for id in video_ids:
2472 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2473 return
2474
2475 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2476 for id in video_ids:
2477 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2478 return
2479
2480 pagenum = pagenum + 1
2481
2482
2483 class YoutubePlaylistIE(InfoExtractor):
2484 """Information Extractor for YouTube playlists."""
2485
2486 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2487 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2488 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2489 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2490 _youtube_ie = None
2491 IE_NAME = u'youtube:playlist'
2492
2493 def __init__(self, youtube_ie, downloader=None):
2494 InfoExtractor.__init__(self, downloader)
2495 self._youtube_ie = youtube_ie
2496
2497 def report_download_page(self, playlist_id, pagenum):
2498 """Report attempt to download playlist page with given number."""
2499 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2500
2501 def _real_initialize(self):
2502 self._youtube_ie.initialize()
2503
2504 def _real_extract(self, url):
2505 # Extract playlist id
2506 mobj = re.match(self._VALID_URL, url)
2507 if mobj is None:
2508 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2509 return
2510
2511 # Single video case
2512 if mobj.group(3) is not None:
2513 self._youtube_ie.extract(mobj.group(3))
2514 return
2515
2516 # Download playlist pages
2517 # prefix is 'p' as default for playlists but there are other types that need extra care
2518 playlist_prefix = mobj.group(1)
2519 if playlist_prefix == 'a':
2520 playlist_access = 'artist'
2521 else:
2522 playlist_prefix = 'p'
2523 playlist_access = 'view_play_list'
2524 playlist_id = mobj.group(2)
2525 video_ids = []
2526 pagenum = 1
2527
2528 while True:
2529 self.report_download_page(playlist_id, pagenum)
2530 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2531 request = urllib2.Request(url)
2532 try:
2533 page = urllib2.urlopen(request).read()
2534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2536 return
2537
2538 # Extract video identifiers
2539 ids_in_page = []
2540 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2541 if mobj.group(1) not in ids_in_page:
2542 ids_in_page.append(mobj.group(1))
2543 video_ids.extend(ids_in_page)
2544
2545 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2546 break
2547 pagenum = pagenum + 1
2548
2549 playliststart = self._downloader.params.get('playliststart', 1) - 1
2550 playlistend = self._downloader.params.get('playlistend', -1)
2551 video_ids = video_ids[playliststart:playlistend]
2552
2553 for id in video_ids:
2554 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2555 return
2556
2557
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	# Matches both full user-page URLs and the 'ytuser:NAME' shorthand;
	# group(1) is the username.
	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# The GData API caps results per request, so uploads are fetched in
	# pages of this size.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Actual video extraction is delegated to the wrapped YoutubeIE.
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all upload ids for the user via the GData feed and
		hand each one to the YouTube extractor, honoring the
		playliststart/playlistend download parameters."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		# playlistend of -1 means 'until the end'; slice open-ended so
		# the last video is not dropped.
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2645
2646 class DepositFilesIE(InfoExtractor):
2647 """Information extractor for depositfiles.com"""
2648
2649 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2650 IE_NAME = u'DepositFiles'
2651
2652 def __init__(self, downloader=None):
2653 InfoExtractor.__init__(self, downloader)
2654
2655 def report_download_webpage(self, file_id):
2656 """Report webpage download."""
2657 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2658
2659 def report_extraction(self, file_id):
2660 """Report information extraction."""
2661 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2662
2663 def _real_extract(self, url):
2664 # At this point we have a new file
2665 self._downloader.increment_downloads()
2666
2667 file_id = url.split('/')[-1]
2668 # Rebuild url in english locale
2669 url = 'http://depositfiles.com/en/files/' + file_id
2670
2671 # Retrieve file webpage with 'Free download' button pressed
2672 free_download_indication = { 'gateway_result' : '1' }
2673 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2674 try:
2675 self.report_download_webpage(file_id)
2676 webpage = urllib2.urlopen(request).read()
2677 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2678 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2679 return
2680
2681 # Search for the real file URL
2682 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2683 if (mobj is None) or (mobj.group(1) is None):
2684 # Try to figure out reason of the error.
2685 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2686 if (mobj is not None) and (mobj.group(1) is not None):
2687 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2688 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2689 else:
2690 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2691 return
2692
2693 file_url = mobj.group(1)
2694 file_extension = os.path.splitext(file_url)[1][1:]
2695
2696 # Search for file title
2697 mobj = re.search(r'<b title="(.*?)">', webpage)
2698 if mobj is None:
2699 self._downloader.trouble(u'ERROR: unable to extract title')
2700 return
2701 file_title = mobj.group(1).decode('utf-8')
2702
2703 try:
2704 # Process file information
2705 self._downloader.process_info({
2706 'id': file_id.decode('utf-8'),
2707 'url': file_url.decode('utf-8'),
2708 'uploader': u'NA',
2709 'upload_date': u'NA',
2710 'title': file_title,
2711 'stitle': file_title,
2712 'ext': file_extension.decode('utf-8'),
2713 'format': u'NA',
2714 'player_url': None,
2715 })
2716 except UnavailableVideoError, err:
2717 self._downloader.trouble(u'ERROR: unable to download file')
2718
2719
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers; the format-selection code below treats this list as
    # ordered best-first (a format_limit slices from its position to the end).
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each entry maps a metadata key to the regex that
        # captures its value from the page's embedded JavaScript/HTML.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are URL-quoted and unicode-escaped inside the page;
                # undo both layers. Missing pieces are simply left out.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one entry per available format name found in the page.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook (if credentials were supplied) before extraction.

        Credentials come from --username/--password or, failing that, from
        the 'facebook' machine entry in ~/.netrc. Without credentials this
        is a no-op. Login failures only warn; extraction proceeds anyway.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse metadata and queue each requested
        format with the downloader."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader (mandatory)
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (mandatory)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        # thumbnail image (optional -- empty string when absent)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date (optional; NOTE(review): _parse_page never produces an
        # 'upload_date' key, so this branch appears to be dead -- confirm)
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz yields a 10-tuple; the first 9 fields form
                    # a struct_time-compatible tuple for strftime.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list to formats at or below the limit
            # (formats are listed best-first in _available_formats).
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
2935
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract a blip.tv video.

        Appends skin=json to the URL and requests it. If the response is
        already a media file (Content-Type: video/*), the URL is downloaded
        directly; otherwise the response is parsed as the JSON metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # 'urlhandle' passes the already-open response to the
                # downloader so the file is not requested twice.
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                # NOTE(review): `json` is not imported in the visible header;
                # it is presumably bound elsewhere in this file (e.g. a
                # json/simplejson fallback) -- confirm.
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H:%M%p' mixes 24-hour %H with %p; %p is
                # effectively ignored by strptime here -- confirm intended.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                # Covers malformed JSON, missing keys and the extension error.
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3028
3029
3030 class MyVideoIE(InfoExtractor):
3031 """Information Extractor for myvideo.de."""
3032
3033 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3034 IE_NAME = u'myvideo'
3035
3036 def __init__(self, downloader=None):
3037 InfoExtractor.__init__(self, downloader)
3038
3039 def report_download_webpage(self, video_id):
3040 """Report webpage download."""
3041 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3042
3043 def report_extraction(self, video_id):
3044 """Report information extraction."""
3045 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3046
3047 def _real_extract(self,url):
3048 mobj = re.match(self._VALID_URL, url)
3049 if mobj is None:
3050 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3051 return
3052
3053 video_id = mobj.group(1)
3054
3055 # Get video webpage
3056 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3057 try:
3058 self.report_download_webpage(video_id)
3059 webpage = urllib2.urlopen(request).read()
3060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3061 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3062 return
3063
3064 self.report_extraction(video_id)
3065 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3066 webpage)
3067 if mobj is None:
3068 self._downloader.trouble(u'ERROR: unable to extract media URL')
3069 return
3070 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3071
3072 mobj = re.search('<title>([^<]+)</title>', webpage)
3073 if mobj is None:
3074 self._downloader.trouble(u'ERROR: unable to extract title')
3075 return
3076
3077 video_title = mobj.group(1)
3078 video_title = sanitize_title(video_title)
3079
3080 simple_title = _simplify_title(video_title)
3081
3082 try:
3083 self._downloader.process_info({
3084 'id': video_id,
3085 'url': video_url,
3086 'uploader': u'NA',
3087 'upload_date': u'NA',
3088 'title': video_title,
3089 'stitle': simple_title,
3090 'ext': u'flv',
3091 'format': u'NA',
3092 'player_url': None,
3093 })
3094 except UnavailableVideoError:
3095 self._downloader.trouble(u'\nERROR: Unable to download video')
3096
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a ':shortname' alias (e.g. ':tds') or a full-episodes URL.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve a show alias or episode URL, fetch the episode's media
        index, and download each media item it lists."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Translate ':tds'-style aliases into the show's full-episodes URL
        # and re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # An empty episode part means "download the newest episode": the
        # site redirects the bare full-episodes URL to the latest one.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Follow the redirect to learn which episode we actually got.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Each match is (full flash URL, mgid-style uri component).
        mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        # Resolve the player URL through its redirects; the final URL is
        # passed along to the downloader for rtmpdump.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        # One <item> per media segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect (bitrate, url) pairs for every available rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            # NOTE(review): taking the last element assumes the feed lists
            # renditions in ascending bitrate order -- confirm.
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': _simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3231
3232
3233 class EscapistIE(InfoExtractor):
3234 """Information extractor for The Escapist """
3235
3236 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3237 IE_NAME = u'escapist'
3238
3239 def report_extraction(self, showName):
3240 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3241
3242 def report_config_download(self, showName):
3243 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3244
3245 def _real_extract(self, url):
3246 htmlParser = HTMLParser.HTMLParser()
3247
3248 mobj = re.match(self._VALID_URL, url)
3249 if mobj is None:
3250 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3251 return
3252 showName = mobj.group('showname')
3253 videoId = mobj.group('episode')
3254
3255 self.report_extraction(showName)
3256 try:
3257 webPage = urllib2.urlopen(url).read()
3258 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3259 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3260 return
3261
3262 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3263 description = htmlParser.unescape(descMatch.group(1))
3264 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3265 imgUrl = htmlParser.unescape(imgMatch.group(1))
3266 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3267 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3268 configUrlMatch = re.search('config=(.*)$', playerUrl)
3269 configUrl = urllib2.unquote(configUrlMatch.group(1))
3270
3271 self.report_config_download(showName)
3272 try:
3273 configJSON = urllib2.urlopen(configUrl).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3276 return
3277
3278 # Technically, it's JavaScript, not JSON
3279 configJSON = configJSON.replace("'", '"')
3280
3281 try:
3282 config = json.loads(configJSON)
3283 except (ValueError,), err:
3284 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3285 return
3286
3287 playlist = config['playlist']
3288 videoUrl = playlist[1]['url']
3289
3290 self._downloader.increment_downloads()
3291 info = {
3292 'id': videoId,
3293 'url': videoUrl,
3294 'uploader': showName,
3295 'upload_date': None,
3296 'title': showName,
3297 'stitle': _simplify_title(showName),
3298 'ext': 'flv',
3299 'format': 'flv',
3300 'thumbnail': imgUrl,
3301 'description': description,
3302 'player_url': playerUrl,
3303 }
3304
3305 try:
3306 self._downloader.process_info(info)
3307 except UnavailableVideoError, err:
3308 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3309
3310
3311 class CollegeHumorIE(InfoExtractor):
3312 """Information extractor for collegehumor.com"""
3313
3314 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3315 IE_NAME = u'collegehumor'
3316
3317 def report_webpage(self, video_id):
3318 """Report information extraction."""
3319 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3320
3321 def report_extraction(self, video_id):
3322 """Report information extraction."""
3323 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3324
3325 def _real_extract(self, url):
3326 htmlParser = HTMLParser.HTMLParser()
3327
3328 mobj = re.match(self._VALID_URL, url)
3329 if mobj is None:
3330 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3331 return
3332 video_id = mobj.group('videoid')
3333
3334 self.report_webpage(video_id)
3335 request = urllib2.Request(url)
3336 try:
3337 webpage = urllib2.urlopen(request).read()
3338 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3339 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3340 return
3341
3342 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3343 if m is None:
3344 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3345 return
3346 internal_video_id = m.group('internalvideoid')
3347
3348 info = {
3349 'id': video_id,
3350 'internal_id': internal_video_id,
3351 }
3352
3353 self.report_extraction(video_id)
3354 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3355 try:
3356 metaXml = urllib2.urlopen(xmlUrl).read()
3357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3358 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3359 return
3360
3361 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3362 try:
3363 videoNode = mdoc.findall('./video')[0]
3364 info['description'] = videoNode.findall('./description')[0].text
3365 info['title'] = videoNode.findall('./caption')[0].text
3366 info['stitle'] = _simplify_title(info['title'])
3367 info['url'] = videoNode.findall('./file')[0].text
3368 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3369 info['ext'] = info['url'].rpartition('.')[2]
3370 info['format'] = info['ext']
3371 except IndexError:
3372 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3373 return
3374
3375 self._downloader.increment_downloads()
3376
3377 try:
3378 self._downloader.process_info(info)
3379 except UnavailableVideoError, err:
3380 self._downloader.trouble(u'\nERROR: unable to download video')
3381
3382
3383 class XVideosIE(InfoExtractor):
3384 """Information extractor for xvideos.com"""
3385
3386 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3387 IE_NAME = u'xvideos'
3388
3389 def report_webpage(self, video_id):
3390 """Report information extraction."""
3391 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3392
3393 def report_extraction(self, video_id):
3394 """Report information extraction."""
3395 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3396
3397 def _real_extract(self, url):
3398 htmlParser = HTMLParser.HTMLParser()
3399
3400 mobj = re.match(self._VALID_URL, url)
3401 if mobj is None:
3402 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3403 return
3404 video_id = mobj.group(1).decode('utf-8')
3405
3406 self.report_webpage(video_id)
3407
3408 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3409 try:
3410 webpage = urllib2.urlopen(request).read()
3411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3412 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3413 return
3414
3415 self.report_extraction(video_id)
3416
3417
3418 # Extract video URL
3419 mobj = re.search(r'flv_url=(.+?)&', webpage)
3420 if mobj is None:
3421 self._downloader.trouble(u'ERROR: unable to extract video url')
3422 return
3423 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3424
3425
3426 # Extract title
3427 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3428 if mobj is None:
3429 self._downloader.trouble(u'ERROR: unable to extract video title')
3430 return
3431 video_title = mobj.group(1).decode('utf-8')
3432
3433
3434 # Extract video thumbnail
3435 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3436 if mobj is None:
3437 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3438 return
3439 video_thumbnail = mobj.group(1).decode('utf-8')
3440
3441
3442
3443 self._downloader.increment_downloads()
3444 info = {
3445 'id': video_id,
3446 'url': video_url,
3447 'uploader': None,
3448 'upload_date': None,
3449 'title': video_title,
3450 'stitle': _simplify_title(video_title),
3451 'ext': 'flv',
3452 'format': 'flv',
3453 'thumbnail': video_thumbnail,
3454 'description': None,
3455 'player_url': None,
3456 }
3457
3458 try:
3459 self._downloader.process_info(info)
3460 except UnavailableVideoError, err:
3461 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3462
3463
3464 class SoundcloudIE(InfoExtractor):
3465 """Information extractor for soundcloud.com
3466 To access the media, the uid of the song and a stream token
3467 must be extracted from the page source and the script must make
3468 a request to media.soundcloud.com/crossdomain.xml. Then
3469 the media can be grabbed by requesting from an url composed
3470 of the stream token and uid
3471 """
3472
3473 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3474 IE_NAME = u'soundcloud'
3475
3476 def __init__(self, downloader=None):
3477 InfoExtractor.__init__(self, downloader)
3478
3479 def report_webpage(self, video_id):
3480 """Report information extraction."""
3481 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3482
3483 def report_extraction(self, video_id):
3484 """Report information extraction."""
3485 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3486
3487 def _real_extract(self, url):
3488 htmlParser = HTMLParser.HTMLParser()
3489
3490 mobj = re.match(self._VALID_URL, url)
3491 if mobj is None:
3492 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3493 return
3494
3495 # extract uploader (which is in the url)
3496 uploader = mobj.group(1).decode('utf-8')
3497 # extract simple title (uploader + slug of song title)
3498 slug_title = mobj.group(2).decode('utf-8')
3499 simple_title = uploader + '-' + slug_title
3500
3501 self.report_webpage('%s/%s' % (uploader, slug_title))
3502
3503 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3504 try:
3505 webpage = urllib2.urlopen(request).read()
3506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3507 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3508 return
3509
3510 self.report_extraction('%s/%s' % (uploader, slug_title))
3511
3512 # extract uid and stream token that soundcloud hands out for access
3513 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3514 if mobj:
3515 video_id = mobj.group(1)
3516 stream_token = mobj.group(2)
3517
3518 # extract unsimplified title
3519 mobj = re.search('"title":"(.*?)",', webpage)
3520 if mobj:
3521 title = mobj.group(1)
3522
3523 # construct media url (with uid/token)
3524 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3525 mediaURL = mediaURL % (video_id, stream_token)
3526
3527 # description
3528 description = u'No description available'
3529 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3530 if mobj:
3531 description = mobj.group(1)
3532
3533 # upload date
3534 upload_date = None
3535 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3536 if mobj:
3537 try:
3538 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3539 except Exception, e:
3540 print str(e)
3541
3542 # for soundcloud, a request to a cross domain is required for cookies
3543 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3544
3545 try:
3546 self._downloader.process_info({
3547 'id': video_id.decode('utf-8'),
3548 'url': mediaURL,
3549 'uploader': uploader.decode('utf-8'),
3550 'upload_date': upload_date,
3551 'title': simple_title.decode('utf-8'),
3552 'stitle': simple_title.decode('utf-8'),
3553 'ext': u'mp3',
3554 'format': u'NA',
3555 'player_url': None,
3556 'description': description.decode('utf-8')
3557 })
3558 except UnavailableVideoError:
3559 self._downloader.trouble(u'\nERROR: unable to download video')
3560
3561
3562 class InfoQIE(InfoExtractor):
3563 """Information extractor for infoq.com"""
3564
3565 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3566 IE_NAME = u'infoq'
3567
3568 def report_webpage(self, video_id):
3569 """Report information extraction."""
3570 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3571
3572 def report_extraction(self, video_id):
3573 """Report information extraction."""
3574 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3575
3576 def _real_extract(self, url):
3577 htmlParser = HTMLParser.HTMLParser()
3578
3579 mobj = re.match(self._VALID_URL, url)
3580 if mobj is None:
3581 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3582 return
3583
3584 self.report_webpage(url)
3585
3586 request = urllib2.Request(url)
3587 try:
3588 webpage = urllib2.urlopen(request).read()
3589 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3590 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3591 return
3592
3593 self.report_extraction(url)
3594
3595
3596 # Extract video URL
3597 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3598 if mobj is None:
3599 self._downloader.trouble(u'ERROR: unable to extract video url')
3600 return
3601 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3602
3603
3604 # Extract title
3605 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3606 if mobj is None:
3607 self._downloader.trouble(u'ERROR: unable to extract video title')
3608 return
3609 video_title = mobj.group(1).decode('utf-8')
3610
3611 # Extract description
3612 video_description = u'No description available.'
3613 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3614 if mobj is not None:
3615 video_description = mobj.group(1).decode('utf-8')
3616
3617 video_filename = video_url.split('/')[-1]
3618 video_id, extension = video_filename.split('.')
3619
3620 self._downloader.increment_downloads()
3621 info = {
3622 'id': video_id,
3623 'url': video_url,
3624 'uploader': None,
3625 'upload_date': None,
3626 'title': video_title,
3627 'stitle': _simplify_title(video_title),
3628 'ext': extension,
3629 'format': extension, # Extension is always(?) mp4, but seems to be flv
3630 'thumbnail': None,
3631 'description': video_description,
3632 'player_url': None,
3633 }
3634
3635 try:
3636 self._downloader.process_info(info)
3637 except UnavailableVideoError, err:
3638 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3639
3640 class MixcloudIE(InfoExtractor):
3641 """Information extractor for www.mixcloud.com"""
3642 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3643 IE_NAME = u'mixcloud'
3644
3645 def __init__(self, downloader=None):
3646 InfoExtractor.__init__(self, downloader)
3647
3648 def report_download_json(self, file_id):
3649 """Report JSON download."""
3650 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3651
3652 def report_extraction(self, file_id):
3653 """Report information extraction."""
3654 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3655
3656 def get_urls(self, jsonData, fmt, bitrate='best'):
3657 """Get urls from 'audio_formats' section in json"""
3658 file_url = None
3659 try:
3660 bitrate_list = jsonData[fmt]
3661 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3662 bitrate = max(bitrate_list) # select highest
3663
3664 url_list = jsonData[fmt][bitrate]
3665 except TypeError: # we have no bitrate info.
3666 url_list = jsonData[fmt]
3667
3668 return url_list
3669
3670 def check_urls(self, url_list):
3671 """Returns 1st active url from list"""
3672 for url in url_list:
3673 try:
3674 urllib2.urlopen(url)
3675 return url
3676 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3677 url = None
3678
3679 return None
3680
3681 def _print_formats(self, formats):
3682 print 'Available formats:'
3683 for fmt in formats.keys():
3684 for b in formats[fmt]:
3685 try:
3686 ext = formats[fmt][b][0]
3687 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3688 except TypeError: # we have no bitrate info
3689 ext = formats[fmt][0]
3690 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3691 break
3692
3693 def _real_extract(self, url):
3694 mobj = re.match(self._VALID_URL, url)
3695 if mobj is None:
3696 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3697 return
3698 # extract uploader & filename from url
3699 uploader = mobj.group(1).decode('utf-8')
3700 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3701
3702 # construct API request
3703 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3704 # retrieve .json file with links to files
3705 request = urllib2.Request(file_url)
3706 try:
3707 self.report_download_json(file_url)
3708 jsonData = urllib2.urlopen(request).read()
3709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3710 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3711 return
3712
3713 # parse JSON
3714 json_data = json.loads(jsonData)
3715 player_url = json_data['player_swf_url']
3716 formats = dict(json_data['audio_formats'])
3717
3718 req_format = self._downloader.params.get('format', None)
3719 bitrate = None
3720
3721 if self._downloader.params.get('listformats', None):
3722 self._print_formats(formats)
3723 return
3724
3725 if req_format is None or req_format == 'best':
3726 for format_param in formats.keys():
3727 url_list = self.get_urls(formats, format_param)
3728 # check urls
3729 file_url = self.check_urls(url_list)
3730 if file_url is not None:
3731 break # got it!
3732 else:
3733 if req_format not in formats.keys():
3734 self._downloader.trouble(u'ERROR: format is not available')
3735 return
3736
3737 url_list = self.get_urls(formats, req_format)
3738 file_url = self.check_urls(url_list)
3739 format_param = req_format
3740
3741 # We have audio
3742 self._downloader.increment_downloads()
3743 try:
3744 # Process file information
3745 self._downloader.process_info({
3746 'id': file_id.decode('utf-8'),
3747 'url': file_url.decode('utf-8'),
3748 'uploader': uploader.decode('utf-8'),
3749 'upload_date': u'NA',
3750 'title': json_data['name'],
3751 'stitle': _simplify_title(json_data['name']),
3752 'ext': file_url.split('.')[-1].decode('utf-8'),
3753 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3754 'thumbnail': json_data['thumbnail_url'],
3755 'description': json_data['description'],
3756 'player_url': player_url.decode('utf-8'),
3757 })
3758 except UnavailableVideoError, err:
3759 self._downloader.trouble(u'ERROR: unable to download file')
3760
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a course page (?course=...) and a single video
	# page (?course=...&video=...); named groups distinguish the three cases.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL type: single video, course playlist, or site root.

		Playlist branches recurse through self.extract() on each referenced page.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Video metadata lives in a per-video XML file next to the media.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# Derive extension/format from the media URL's suffix.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Title/description are optional; fall back to the course id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Each linked VideoPage becomes a reference entry extracted below.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
				for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Recurse into every linked CoursePage (each hits the branch above).
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
				for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3879
3880
class PostProcessor(object):
	"""Base class for download post-processing steps.

	Instances are registered on a downloader with its add_post_processor()
	method. After each successful download, the downloader feeds the info
	dictionary through every registered PostProcessor's run() method in
	turn, handing each returned dictionary to the next processor. The chain
	stops when a processor returns None or when the last one has run.

	Like InfoExtractor, this class participates in a "mutual registration"
	scheme with its downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this post processor reports to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		information is a dictionary shaped like the ones InfoExtractors
		build, plus an extra 'filepath' key naming the downloaded file.

		Return a dictionary (possibly the same one, with fields changed)
		to pass along the chain, or None to stop postprocessing. May raise
		a PostProcessingError, which the calling downloader handles.
		"""
		return information # default: pass through unchanged
3926
class AudioConversionError(BaseException):
	"""Raised when an ffmpeg/ffprobe audio conversion step fails.

	NOTE(review): deliberately kept as a BaseException subclass to match the
	original; broad 'except Exception' handlers will not catch it, and the
	post processor below checks for it by isinstance.
	"""

	def __init__(self, message):
		self.message = message
3930
class FFmpegExtractAudioPP(PostProcessor):
	# Post processor that converts a downloaded video into an audio-only file
	# by shelling out to the external ffprobe and ffmpeg binaries.

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		"""Set up the post processor.

		preferredcodec: target codec ('best', 'aac', 'mp3', 'm4a', 'vorbis',
			'wav'); None is treated as 'best'.
		preferredquality: ffmpeg audio bitrate specification (e.g. '128K'), or None.
		keepvideo: if True, keep the source video file after conversion.
		"""
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name ffprobe reports for path, or None on any failure."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe missing or not executable
			return None
		audio_codec = None
		# ffprobe emits key=value lines grouped per stream; remember the last
		# codec_name seen and report it once its stream turns out to be audio.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to convert path into out_path.

		codec: value passed to ffmpeg's -acodec, or None to omit the option.
		more_opts: extra command-line arguments inserted before the output path.
		Raises AudioConversionError if ffmpeg is missing or exits non-zero.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				# errno 2 == ENOENT: the ffmpeg binary itself was not found
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# Surface ffmpeg's last stderr line as the error message.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""PostProcessor entry point: convert the file at information['filepath'].

		Returns the updated information dict, or None to stop the chain.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the source codec already matches the request (or
		# 'best' was asked for), so prefer a lossless stream copy.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		# Output path: same name as the source, audio extension swapped in.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# NOTE(review): bare except also traps KeyboardInterrupt/SystemExit here.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4055
4056
4057 def updateSelf(downloader, filename):
4058 ''' Update the program file with the latest version from the repository '''
4059 # Note: downloader only used for options
4060 if not os.access(filename, os.W_OK):
4061 sys.exit('ERROR: no write permissions on %s' % filename)
4062
4063 downloader.to_screen('Updating to latest version...')
4064
4065 try:
4066 try:
4067 urlh = urllib.urlopen(UPDATE_URL)
4068 newcontent = urlh.read()
4069
4070 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4071 if vmatch is not None and vmatch.group(1) == __version__:
4072 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4073 return
4074 finally:
4075 urlh.close()
4076 except (IOError, OSError), err:
4077 sys.exit('ERROR: unable to download latest version')
4078
4079 try:
4080 outf = open(filename, 'wb')
4081 try:
4082 outf.write(newcontent)
4083 finally:
4084 outf.close()
4085 except (IOError, OSError), err:
4086 sys.exit('ERROR: unable to overwrite current version')
4087
4088 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4089
def parseOpts():
	"""Build the optparse parser, merge config-file options with sys.argv,
	and return (parser, opts, args)."""
	# Deferred imports
	import getpass
	import optparse
	import shlex

	def _readOptions(filename):
		# Read extra command-line arguments from a config file, one or more
		# per line, honoring shell-style quoting and # comments.
		try:
			optionf = open(filename)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Terminal width: $COLUMNS if set, else ask stty, else None.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version' : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general = optparse.OptionGroup(parser, 'General Options')
	selection = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	postproc = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Options come from /etc config, then the per-user config, then argv,
	# so later sources override earlier ones.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4299
def gen_extractors():
	"""Build the ordered list with one instance of every supported extractor.

	Ordering matters: the first extractor whose suitable() accepts a URL is
	the one that handles it, so the catch-all GenericIE is appended last.
	"""
	# A few extractors are shared with their search/playlist wrappers.
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()
	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
	]
	extractors.append(GenericIE())
	return extractors
4335
4336 def _real_main():
4337 parser, opts, args = parseOpts()
4338
4339 # Open appropriate CookieJar
4340 if opts.cookiefile is None:
4341 jar = cookielib.CookieJar()
4342 else:
4343 try:
4344 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4345 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4346 jar.load()
4347 except (IOError, OSError), err:
4348 sys.exit(u'ERROR: unable to open cookie file')
4349
4350 # Dump user agent
4351 if opts.dump_user_agent:
4352 print std_headers['User-Agent']
4353 sys.exit(0)
4354
4355 # Batch file verification
4356 batchurls = []
4357 if opts.batchfile is not None:
4358 try:
4359 if opts.batchfile == '-':
4360 batchfd = sys.stdin
4361 else:
4362 batchfd = open(opts.batchfile, 'r')
4363 batchurls = batchfd.readlines()
4364 batchurls = [x.strip() for x in batchurls]
4365 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4366 except IOError:
4367 sys.exit(u'ERROR: batch file could not be read')
4368 all_urls = batchurls + args
4369
4370 # General configuration
4371 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4372 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4373 urllib2.install_opener(opener)
4374 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4375
4376 extractors = gen_extractors()
4377
4378 if opts.list_extractors:
4379 for ie in extractors:
4380 print(ie.IE_NAME)
4381 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4382 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4383 for mu in matchedUrls:
4384 print(u' ' + mu)
4385 sys.exit(0)
4386
4387 # Conflicting, missing and erroneous options
4388 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4389 parser.error(u'using .netrc conflicts with giving username/password')
4390 if opts.password is not None and opts.username is None:
4391 parser.error(u'account username missing')
4392 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4393 parser.error(u'using output template conflicts with using title, literal title or auto number')
4394 if opts.usetitle and opts.useliteral:
4395 parser.error(u'using title conflicts with using literal title')
4396 if opts.username is not None and opts.password is None:
4397 opts.password = getpass.getpass(u'Type account password and press return:')
4398 if opts.ratelimit is not None:
4399 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4400 if numeric_limit is None:
4401 parser.error(u'invalid rate limit specified')
4402 opts.ratelimit = numeric_limit
4403 if opts.retries is not None:
4404 try:
4405 opts.retries = long(opts.retries)
4406 except (TypeError, ValueError), err:
4407 parser.error(u'invalid retry count specified')
4408 try:
4409 opts.playliststart = int(opts.playliststart)
4410 if opts.playliststart <= 0:
4411 raise ValueError(u'Playlist start must be positive')
4412 except (TypeError, ValueError), err:
4413 parser.error(u'invalid playlist start number specified')
4414 try:
4415 opts.playlistend = int(opts.playlistend)
4416 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4417 raise ValueError(u'Playlist end must be greater than playlist start')
4418 except (TypeError, ValueError), err:
4419 parser.error(u'invalid playlist end number specified')
4420 if opts.extractaudio:
4421 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4422 parser.error(u'invalid audio format specified')
4423
4424 # File downloader
4425 fd = FileDownloader({
4426 'usenetrc': opts.usenetrc,
4427 'username': opts.username,
4428 'password': opts.password,
4429 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4430 'forceurl': opts.geturl,
4431 'forcetitle': opts.gettitle,
4432 'forcethumbnail': opts.getthumbnail,
4433 'forcedescription': opts.getdescription,
4434 'forcefilename': opts.getfilename,
4435 'forceformat': opts.getformat,
4436 'simulate': opts.simulate,
4437 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4438 'format': opts.format,
4439 'format_limit': opts.format_limit,
4440 'listformats': opts.listformats,
4441 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4442 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4443 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4444 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4445 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4446 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4447 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4448 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4449 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4450 or u'%(id)s.%(ext)s'),
4451 'ignoreerrors': opts.ignoreerrors,
4452 'ratelimit': opts.ratelimit,
4453 'nooverwrites': opts.nooverwrites,
4454 'retries': opts.retries,
4455 'continuedl': opts.continue_dl,
4456 'noprogress': opts.noprogress,
4457 'playliststart': opts.playliststart,
4458 'playlistend': opts.playlistend,
4459 'logtostderr': opts.outtmpl == '-',
4460 'consoletitle': opts.consoletitle,
4461 'nopart': opts.nopart,
4462 'updatetime': opts.updatetime,
4463 'writedescription': opts.writedescription,
4464 'writeinfojson': opts.writeinfojson,
4465 'matchtitle': opts.matchtitle,
4466 'rejecttitle': opts.rejecttitle,
4467 'max_downloads': opts.max_downloads,
4468 'prefer_free_formats': opts.prefer_free_formats,
4469 })
4470 for extractor in extractors:
4471 fd.add_info_extractor(extractor)
4472
4473 # PostProcessors
4474 if opts.extractaudio:
4475 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4476
4477 # Update version
4478 if opts.update_self:
4479 updateSelf(fd, sys.argv[0])
4480
4481 # Maybe do nothing
4482 if len(all_urls) < 1:
4483 if not opts.update_self:
4484 parser.error(u'you must provide at least one URL')
4485 else:
4486 sys.exit()
4487
4488 try:
4489 retcode = fd.download(all_urls)
4490 except MaxDownloadsReached:
4491 fd.to_screen(u'--max-download limit reached, aborting.')
4492 retcode = 101
4493
4494 # Dump cookie jar if requested
4495 if opts.cookiefile is not None:
4496 try:
4497 jar.save()
4498 except (IOError, OSError), err:
4499 sys.exit(u'ERROR: unable to save cookie jar')
4500
4501 sys.exit(retcode)
4502
def main():
	"""Program entry point.

	Delegates to _real_main() and maps the known failure modes onto
	process exit statuses / error messages.
	"""
	try:
		_real_main()
	except DownloadError:
		status = 1
	except SameFileError:
		status = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		status = u'\nERROR: Interrupted by user'
	else:
		return
	# sys.exit with a string prints it to stderr and exits with code 1.
	sys.exit(status)
4512
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: