]> jfr.im git - yt-dlp.git/blob - youtube-dl
added youtube closed captions .srt support (see #90)
[yt-dlp.git] / youtube-dl
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# People who have contributed code to this script, in rough order of
# first contribution.
__authors__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
__version__ = '2012.02.27'

# Location of the latest released script; used by the self-update mechanism.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
# ctypes is only needed on Windows, for console-title manipulation.
if os.name == 'nt':
    import ctypes

try:
    import email.utils
except ImportError: # Python 2.4
    import email.Utils
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

# lxml is optional; code elsewhere falls back to xml.etree when absent.
try:
    import lxml.etree
except ImportError:
    pass # Handled below

try:
    import xml.etree.ElementTree
except ImportError: # Python<2.5: Not officially supported, but let it slip
    warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')

# Default HTTP headers added to every request (see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
86
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal pure-Python stand-in for the stdlib json module.

        Only implements loads(); every nested helper threads an index i
        through the input string s and returns (next_index, value).
        """
        @staticmethod
        def loads(s):
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                # Uniform error reporting with position and remaining input.
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance i past JSON whitespace; optionally require that
                # something follows (i.e. we are not at end of input).
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (including \uXXXX and
                # UTF-16 surrogate pairs) into the character it denotes.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine high and low halves.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote, skipping over escaped quotes
                # (a quote preceded by an odd number of backslashes).
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The three keyword literals: true, false, null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # A fraction or exponent makes it a float; otherwise int.
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first significant character of a value.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the
    reported encoding cannot actually encode text, fall back to UTF-8.
    """
    # The original implementation wrapped this logic in a one-shot
    # generator and called .next() on it, which added nothing and relied
    # on the Python-2-only generator method name.
    try:
        pref = locale.getpreferredencoding()
        # Some broken locales report an encoding the codecs machinery
        # cannot use; probe it with a trivial encode.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
215
216
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character
    # NOTE(review): the pattern only allows decimal digits after the
    # optional 'x', so hexadecimal entities containing a-f (e.g. &#xe9;)
    # fall through to the literal branch below — confirm if intended.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr  # prefix '0' so '0x...' parses as hex
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
242
243
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Resolve HTML entities first, then neutralize path separators.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # u'-' means standard output; on Windows it must be switched
            # to binary mode so video data is not newline-mangled.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
275
276
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz returns None for anything it cannot parse; propagate
    # that as a None timestamp.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
284
def _simplify_title(title):
    """Collapse every run of non-word characters in title to a single
    underscore and strip leading/trailing underscores."""
    expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
    return expr.sub(u'_', title).strip(u'_')
288
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
291 res = []
292 for el in iterable:
293 if el not in res:
294 res.append(el)
295 return res
296
def _unescapeHTML(s):
    """
    Replace HTML entities in s by the characters they represent.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    return HTMLParser.HTMLParser().unescape(s)
305
306 def _encodeFilename(s):
307 """
308 @param s The name of the file (of type unicode)
309 """
310
311 assert type(s) == type(u'')
312
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317 return s
318 else:
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
    """Raised when downloading fails.

    FileDownloader objects throw this exception, carrying the relevant
    error message, when they are not configured to continue on errors.
    """
    pass
329
330
class SameFileError(Exception):
    """Raised when several downloads would collide on disk.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be written to the same filename.
    """
    pass
338
339
class PostProcessingError(Exception):
    """Raised on errors during postprocessing.

    A PostProcessor's .run() method may raise this exception to indicate
    that the postprocessing task failed.
    """
    pass
347
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
    pass
351
352
class UnavailableVideoError(Exception):
    """Raised when a video is requested in an unavailable format.

    Thrown when the requested format does not exist for the given video.
    """
    pass
360
361
class ContentTooShortError(Exception):
    """Raised when a downloaded file is smaller than announced.

    FileDownloader objects raise this when the data served is shorter
    than the length the server declared first, which usually means the
    connection was interrupted.
    """
    # Both counters are in bytes.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.expected = expected
        self.downloaded = downloaded
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Raw deflate stream first; fall back to zlib-wrapped data, since
        # servers disagree on what "deflate" means.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older Pythons' addinfourl lacks the code argument/getcode();
        # set the attribute by hand in that case.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our std_headers values over whatever was set before.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # NOTE: header names appear capitalized like this because
        # urllib2 normalizes them; presumably 'Youtubedl-no-compression'
        # matches the stored form of "Youtubedl-No-Compression".
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
435
436
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    writesubtitles:   Write the video subtitles to a .srt file
    """

    params = None               # Options dictionary (see docstring above)
    _ies = []                   # Registered InfoExtractors (instance list set in __init__)
    _pps = []                   # Registered PostProcessors (instance list set in __init__)
    _download_retcode = None    # Exit code eventually returned by download()
    _num_downloads = None       # Ordinal of current download, for %(autonumber)s
    _screen_file = None         # Stream used by to_screen() (stdout or stderr)
502
503 def __init__(self, params):
504 """Create a FileDownloader object with the given options."""
505 self._ies = []
506 self._pps = []
507 self._download_retcode = 0
508 self._num_downloads = 0
509 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
510 self.params = params
511
    @staticmethod
    def format_bytes(bytes):
        """Render a byte count as a short human-readable string, e.g. '1.00M'."""
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            # Largest power of 1024 not exceeding the value.
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Format download progress as a fixed-width percentage string."""
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Estimate remaining download time as 'MM:SS' (or '--:--' if unknown)."""
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # The display only has room for two minute digits.
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Format the average download speed since start as e.g. '  1.00Mb/s'."""
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Pick the next read block size from the last block's throughput."""
        # Allow the block size to at most halve or double each step.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # Empty suffix maps to index 0 ('b'), i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
575
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE needs a back-reference to us.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            # skip_eol suppresses the newline so progress lines can be
            # rewritten in place with '\r'.
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator

            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            self._screen_file.flush()
597
    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style escape sequence: OSC 0 ; <title> BEL
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # Fixed means it contains no %(field)s placeholders at all, so
        # every download would be written to the same file.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
616
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # With --ignore-errors: keep going, but remember the failure in
        # the eventual process exit code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the average speed to drop back
            # to the configured limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
642
    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # Download straight to the target when .part files are disabled,
        # when writing to stdout, or when the target exists but is not a
        # regular file (e.g. a named pipe or device node).
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
            return filename
        return filename + u'.part'

    def undo_temp_name(self, filename):
        """Strip the u'.part' suffix added by temp_name(), if present."""
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        return filename

    def try_rename(self, old_filename, new_filename):
        """Rename the finished .part file to its final name, reporting failure."""
        try:
            if old_filename == new_filename:
                return
            os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
662
663 def try_utime(self, filename, last_modified_hdr):
664 """Try to set the last-modified time of the given file."""
665 if last_modified_hdr is None:
666 return
667 if not os.path.isfile(_encodeFilename(filename)):
668 return
669 timestr = last_modified_hdr
670 if timestr is None:
671 return
672 filetime = timeconvert(timestr)
673 if filetime is None:
674 return filetime
675 try:
676 os.utime(filename, (time.time(), filetime))
677 except:
678 pass
679 return filetime
680
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)

    def report_writesubtitles(self, srtfn):
        """ Report that the subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # Leading '\r' + skip_eol rewrites the same terminal line.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The filename may not be representable in the console
            # encoding; fall back to a generic message.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # Just terminate the in-place progress line.
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
735
    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        try:
            template_dict = dict(info_dict)
            # Extra template fields computed at download time.
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
747
748 def _match_entry(self, info_dict):
749 """ Returns None iff the file should be downloaded """
750
751 title = info_dict['title']
752 matchtitle = self.params.get('matchtitle', False)
753 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
754 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
755 rejecttitle = self.params.get('rejecttitle', False)
756 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
757 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
758 return None
759
760 def process_info(self, info_dict):
761 """Process a single dictionary returned by an InfoExtractor."""
762
763 reason = self._match_entry(info_dict)
764 if reason is not None:
765 self.to_screen(u'[download] ' + reason)
766 return
767
768 max_downloads = self.params.get('max_downloads')
769 if max_downloads is not None:
770 if self._num_downloads > int(max_downloads):
771 raise MaxDownloadsReached()
772
773 filename = self.prepare_filename(info_dict)
774
775 # Forced printings
776 if self.params.get('forcetitle', False):
777 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
778 if self.params.get('forceurl', False):
779 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
781 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcedescription', False) and 'description' in info_dict:
783 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcefilename', False) and filename is not None:
785 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forceformat', False):
787 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
788
789 # Do nothing else if in simulate mode
790 if self.params.get('simulate', False):
791 return
792
793 if filename is None:
794 return
795
796 try:
797 dn = os.path.dirname(_encodeFilename(filename))
798 if dn != '' and not os.path.exists(dn): # dn is already encoded
799 os.makedirs(dn)
800 except (OSError, IOError), err:
801 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
802 return
803
804 if self.params.get('writedescription', False):
805 try:
806 descfn = filename + u'.description'
807 self.report_writedescription(descfn)
808 descfile = open(_encodeFilename(descfn), 'wb')
809 try:
810 descfile.write(info_dict['description'].encode('utf-8'))
811 finally:
812 descfile.close()
813 except (OSError, IOError):
814 self.trouble(u'ERROR: Cannot write description file ' + descfn)
815 return
816
817 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
818 # subtitles download errors are already managed as troubles in relevant IE
819 # that way it will silently go on when used with unsupporting IE
820 try:
821 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
822 self.report_writesubtitles(srtfn)
823 srtfile = open(_encodeFilename(srtfn), 'wb')
824 try:
825 srtfile.write(info_dict['subtitles'].encode('utf-8'))
826 finally:
827 srtfile.close()
828 except (OSError, IOError):
829 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
830 return
831
832 if self.params.get('writeinfojson', False):
833 infofn = filename + u'.info.json'
834 self.report_writeinfojson(infofn)
835 try:
836 json.dump
837 except (NameError,AttributeError):
838 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
839 return
840 try:
841 infof = open(_encodeFilename(infofn), 'wb')
842 try:
843 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
844 json.dump(json_info_dict, infof)
845 finally:
846 infof.close()
847 except (OSError, IOError):
848 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
849 return
850
851 if not self.params.get('skip_download', False):
852 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
853 success = True
854 else:
855 try:
856 success = self._do_download(filename, info_dict)
857 except (OSError, IOError), err:
858 raise UnavailableVideoError
859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
861 return
862 except (ContentTooShortError, ), err:
863 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
864 return
865
866 if success:
867 try:
868 self.post_process(filename, info_dict)
869 except (PostProcessingError), err:
870 self.trouble(u'ERROR: postprocessing: %s' % str(err))
871 return
872
873 def download(self, url_list):
874 """Download a given list of URLs."""
875 if len(url_list) > 1 and self.fixed_template():
876 raise SameFileError(self.params['outtmpl'])
877
878 for url in url_list:
879 suitable_found = False
880 for ie in self._ies:
881 # Go to next InfoExtractor if not suitable
882 if not ie.suitable(url):
883 continue
884
885 # Suitable InfoExtractor found
886 suitable_found = True
887
888 # Extract information from URL and process it
889 ie.extract(url)
890
891 # Suitable InfoExtractor had been found; go to next URL
892 break
893
894 if not suitable_found:
895 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
896
897 return self._download_retcode
898
899 def post_process(self, filename, ie_info):
900 """Run the postprocessing chain on the given file."""
901 info = dict(ie_info)
902 info['filepath'] = filename
903 for pp in self._pps:
904 info = pp.run(info)
905 if info is None:
906 break
907
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(review): the devnull handle opened here is never explicitly
        # closed; it is only reclaimed by garbage collection.
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            try:
                import pipes
                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            except ImportError:
                shell_quote = repr
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        # Keep resuming (-e) while rtmpdump reports a resumable failure
        # and the file keeps growing.
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                break
             # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
952
953 def _do_download(self, filename, info_dict):
954 url = info_dict['url']
955 player_url = info_dict.get('player_url', None)
956
957 # Check file already present
958 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
959 self.report_file_already_downloaded(filename)
960 return True
961
962 # Attempt to download using rtmpdump
963 if url.startswith('rtmp'):
964 return self._download_with_rtmpdump(filename, url, player_url)
965
966 tmpfilename = self.temp_name(filename)
967 stream = None
968
969 # Do not include the Accept-Encoding header
970 headers = {'Youtubedl-no-compression': 'True'}
971 basic_request = urllib2.Request(url, None, headers)
972 request = urllib2.Request(url, None, headers)
973
974 # Establish possible resume length
975 if os.path.isfile(_encodeFilename(tmpfilename)):
976 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
977 else:
978 resume_len = 0
979
980 open_mode = 'wb'
981 if resume_len != 0:
982 if self.params.get('continuedl', False):
983 self.report_resuming_byte(resume_len)
984 request.add_header('Range','bytes=%d-' % resume_len)
985 open_mode = 'ab'
986 else:
987 resume_len = 0
988
989 count = 0
990 retries = self.params.get('retries', 0)
991 while count <= retries:
992 # Establish connection
993 try:
994 if count == 0 and 'urlhandle' in info_dict:
995 data = info_dict['urlhandle']
996 data = urllib2.urlopen(request)
997 break
998 except (urllib2.HTTPError, ), err:
999 if (err.code < 500 or err.code >= 600) and err.code != 416:
1000 # Unexpected HTTP error
1001 raise
1002 elif err.code == 416:
1003 # Unable to resume (requested range not satisfiable)
1004 try:
1005 # Open the connection again without the range header
1006 data = urllib2.urlopen(basic_request)
1007 content_length = data.info()['Content-Length']
1008 except (urllib2.HTTPError, ), err:
1009 if err.code < 500 or err.code >= 600:
1010 raise
1011 else:
1012 # Examine the reported length
1013 if (content_length is not None and
1014 (resume_len - 100 < long(content_length) < resume_len + 100)):
1015 # The file had already been fully downloaded.
1016 # Explanation to the above condition: in issue #175 it was revealed that
1017 # YouTube sometimes adds or removes a few bytes from the end of the file,
1018 # changing the file size slightly and causing problems for some users. So
1019 # I decided to implement a suggested change and consider the file
1020 # completely downloaded if the file size differs less than 100 bytes from
1021 # the one in the hard drive.
1022 self.report_file_already_downloaded(filename)
1023 self.try_rename(tmpfilename, filename)
1024 return True
1025 else:
1026 # The length does not match, we start the download over
1027 self.report_unable_to_resume()
1028 open_mode = 'wb'
1029 break
1030 # Retry
1031 count += 1
1032 if count <= retries:
1033 self.report_retry(count, retries)
1034
1035 if count > retries:
1036 self.trouble(u'ERROR: giving up after %s retries' % retries)
1037 return False
1038
1039 data_len = data.info().get('Content-length', None)
1040 if data_len is not None:
1041 data_len = long(data_len) + resume_len
1042 data_len_str = self.format_bytes(data_len)
1043 byte_counter = 0 + resume_len
1044 block_size = 1024
1045 start = time.time()
1046 while True:
1047 # Download and write
1048 before = time.time()
1049 data_block = data.read(block_size)
1050 after = time.time()
1051 if len(data_block) == 0:
1052 break
1053 byte_counter += len(data_block)
1054
1055 # Open file just in time
1056 if stream is None:
1057 try:
1058 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1059 assert stream is not None
1060 filename = self.undo_temp_name(tmpfilename)
1061 self.report_destination(filename)
1062 except (OSError, IOError), err:
1063 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1064 return False
1065 try:
1066 stream.write(data_block)
1067 except (IOError, OSError), err:
1068 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1069 return False
1070 block_size = self.best_block_size(after - before, len(data_block))
1071
1072 # Progress message
1073 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1074 if data_len is None:
1075 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1076 else:
1077 percent_str = self.calc_percent(byte_counter, data_len)
1078 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1079 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1080
1081 # Apply rate limit
1082 self.slow_down(start, byte_counter - resume_len)
1083
1084 if stream is None:
1085 self.trouble(u'\nERROR: Did not get any data blocks')
1086 return False
1087 stream.close()
1088 self.report_finish()
1089 if data_len is not None and byte_counter != data_len:
1090 raise ContentTooShortError(byte_counter, long(data_len))
1091 self.try_rename(tmpfilename, filename)
1092
1093 # Update file modification time
1094 if self.params.get('updatetime', True):
1095 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1096
1097 return True
1098
1099
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out of the page(s) it
	refers to everything the FileDownloader needs: the real video URL,
	the title and its simplified form, the uploader and so on.  The data
	travels as a dictionary handed to FileDownloader, which may then
	download the video to disk, among other outcomes.  Each dictionary
	must provide these fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The fields below are optional; their main purpose is letting
	youtube-dl act as the backend of a video search feature such as the
	one in youtube2mp3, and they are only consulted by the corresponding
	forced-printing functions:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Concrete extractors override _real_initialize() and _real_extract()
	and declare a _VALID_URL regexp; most should also be registered in
	the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance
	_ready = False
	# The FileDownloader this extractor reports to (may stay None)
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1168
1169
1170 class YoutubeIE(InfoExtractor):
1171 """Information extractor for youtube.com."""
1172
1173 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1174 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1175 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1176 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1177 _NETRC_MACHINE = 'youtube'
1178 # Listed in order of quality
1179 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1180 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1181 _video_extensions = {
1182 '13': '3gp',
1183 '17': 'mp4',
1184 '18': 'mp4',
1185 '22': 'mp4',
1186 '37': 'mp4',
1187 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1188 '43': 'webm',
1189 '44': 'webm',
1190 '45': 'webm',
1191 }
1192 _video_dimensions = {
1193 '5': '240x400',
1194 '6': '???',
1195 '13': '???',
1196 '17': '144x176',
1197 '18': '360x640',
1198 '22': '720x1280',
1199 '34': '360x640',
1200 '35': '480x854',
1201 '37': '1080x1920',
1202 '38': '3072x4096',
1203 '43': '360x640',
1204 '44': '480x854',
1205 '45': '720x1280',
1206 }
1207 IE_NAME = u'youtube'
1208
1209 def report_lang(self):
1210 """Report attempt to set language."""
1211 self._downloader.to_screen(u'[youtube] Setting language')
1212
1213 def report_login(self):
1214 """Report attempt to log in."""
1215 self._downloader.to_screen(u'[youtube] Logging in')
1216
1217 def report_age_confirmation(self):
1218 """Report attempt to confirm age."""
1219 self._downloader.to_screen(u'[youtube] Confirming age')
1220
1221 def report_video_webpage_download(self, video_id):
1222 """Report attempt to download video webpage."""
1223 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1224
1225 def report_video_info_webpage_download(self, video_id):
1226 """Report attempt to download video info webpage."""
1227 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1228
1229 def report_video_subtitles_download(self, video_id):
1230 """Report attempt to download video info webpage."""
1231 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1232
1233 def report_information_extraction(self, video_id):
1234 """Report attempt to extract video information."""
1235 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1236
1237 def report_unavailable_format(self, video_id, format):
1238 """Report extracted video URL."""
1239 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1240
1241 def report_rtmp_download(self):
1242 """Indicate the download will use the RTMP protocol."""
1243 self._downloader.to_screen(u'[youtube] RTMP download detected')
1244
1245 def _closed_captions_xml_to_srt(self, xml_string):
1246 srt = ''
1247 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1248 # TODO parse xml instead of regex
1249 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1250 if not dur: dur = '4'
1251 start = float(start)
1252 end = start + float(dur)
1253 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1254 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1255 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1256 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1257 srt += str(n) + '\n'
1258 srt += start + ' --> ' + end + '\n'
1259 srt += caption + '\n\n'
1260 return srt
1261
1262 def _print_formats(self, formats):
1263 print 'Available formats:'
1264 for x in formats:
1265 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1266
1267 def _real_initialize(self):
1268 if self._downloader is None:
1269 return
1270
1271 username = None
1272 password = None
1273 downloader_params = self._downloader.params
1274
1275 # Attempt to use provided username and password or .netrc data
1276 if downloader_params.get('username', None) is not None:
1277 username = downloader_params['username']
1278 password = downloader_params['password']
1279 elif downloader_params.get('usenetrc', False):
1280 try:
1281 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1282 if info is not None:
1283 username = info[0]
1284 password = info[2]
1285 else:
1286 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1287 except (IOError, netrc.NetrcParseError), err:
1288 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1289 return
1290
1291 # Set language
1292 request = urllib2.Request(self._LANG_URL)
1293 try:
1294 self.report_lang()
1295 urllib2.urlopen(request).read()
1296 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1297 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1298 return
1299
1300 # No authentication to be performed
1301 if username is None:
1302 return
1303
1304 # Log in
1305 login_form = {
1306 'current_form': 'loginForm',
1307 'next': '/',
1308 'action_login': 'Log In',
1309 'username': username,
1310 'password': password,
1311 }
1312 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1313 try:
1314 self.report_login()
1315 login_results = urllib2.urlopen(request).read()
1316 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1317 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1318 return
1319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1320 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1321 return
1322
1323 # Confirm age
1324 age_form = {
1325 'next_url': '/',
1326 'action_confirm': 'Confirm',
1327 }
1328 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1329 try:
1330 self.report_age_confirmation()
1331 age_results = urllib2.urlopen(request).read()
1332 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1333 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1334 return
1335
1336 def _real_extract(self, url):
1337 # Extract video id from URL
1338 mobj = re.match(self._VALID_URL, url)
1339 if mobj is None:
1340 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1341 return
1342 video_id = mobj.group(2)
1343
1344 # Get video webpage
1345 self.report_video_webpage_download(video_id)
1346 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1347 try:
1348 video_webpage = urllib2.urlopen(request).read()
1349 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1350 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1351 return
1352
1353 # Attempt to extract SWF player URL
1354 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1355 if mobj is not None:
1356 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1357 else:
1358 player_url = None
1359
1360 # Get video info
1361 self.report_video_info_webpage_download(video_id)
1362 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1363 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1364 % (video_id, el_type))
1365 request = urllib2.Request(video_info_url)
1366 try:
1367 video_info_webpage = urllib2.urlopen(request).read()
1368 video_info = parse_qs(video_info_webpage)
1369 if 'token' in video_info:
1370 break
1371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1372 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1373 return
1374 if 'token' not in video_info:
1375 if 'reason' in video_info:
1376 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1377 else:
1378 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1379 return
1380
1381 # Start extracting information
1382 self.report_information_extraction(video_id)
1383
1384 # uploader
1385 if 'author' not in video_info:
1386 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1387 return
1388 video_uploader = urllib.unquote_plus(video_info['author'][0])
1389
1390 # title
1391 if 'title' not in video_info:
1392 self._downloader.trouble(u'ERROR: unable to extract video title')
1393 return
1394 video_title = urllib.unquote_plus(video_info['title'][0])
1395 video_title = video_title.decode('utf-8')
1396 video_title = sanitize_title(video_title)
1397
1398 # simplified title
1399 simple_title = _simplify_title(video_title)
1400
1401 # thumbnail image
1402 if 'thumbnail_url' not in video_info:
1403 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1404 video_thumbnail = ''
1405 else: # don't panic if we can't find it
1406 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1407
1408 # upload date
1409 upload_date = u'NA'
1410 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1411 if mobj is not None:
1412 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1413 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1414 for expression in format_expressions:
1415 try:
1416 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1417 except:
1418 pass
1419
1420 # description
1421 try:
1422 lxml.etree
1423 except NameError:
1424 video_description = u'No description available.'
1425 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1426 if mobj is not None:
1427 video_description = mobj.group(1).decode('utf-8')
1428 else:
1429 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1430 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1431 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1432 # TODO use another parser
1433
1434 # closed captions
1435 video_subtitles = None
1436 if self._downloader.params.get('writesubtitles', False):
1437 self.report_video_subtitles_download(video_id)
1438 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1439 try:
1440 srt_list = urllib2.urlopen(request).read()
1441 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1443 else:
1444 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1445 if srt_lang_list:
1446 if 'en' in srt_lang_list: srt_lang = 'en'
1447 else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override
1448 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1449 try:
1450 srt_xml = urllib2.urlopen(request).read()
1451 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1452 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1453 else:
1454 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1455 else:
1456 self._downloader.trouble(u'WARNING: video has no subtitles')
1457
1458 # token
1459 video_token = urllib.unquote_plus(video_info['token'][0])
1460
1461 # Decide which formats to download
1462 req_format = self._downloader.params.get('format', None)
1463
1464 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1465 self.report_rtmp_download()
1466 video_url_list = [(None, video_info['conn'][0])]
1467 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1468 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1469 url_data = [parse_qs(uds) for uds in url_data_strs]
1470 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1471 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1472
1473 format_limit = self._downloader.params.get('format_limit', None)
1474 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1475 if format_limit is not None and format_limit in available_formats:
1476 format_list = available_formats[available_formats.index(format_limit):]
1477 else:
1478 format_list = available_formats
1479 existing_formats = [x for x in format_list if x in url_map]
1480 if len(existing_formats) == 0:
1481 self._downloader.trouble(u'ERROR: no known formats available for video')
1482 return
1483 if self._downloader.params.get('listformats', None):
1484 self._print_formats(existing_formats)
1485 return
1486 if req_format is None or req_format == 'best':
1487 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1488 elif req_format == 'worst':
1489 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1490 elif req_format in ('-1', 'all'):
1491 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1492 else:
1493 # Specific formats. We pick the first in a slash-delimeted sequence.
1494 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1495 req_formats = req_format.split('/')
1496 video_url_list = None
1497 for rf in req_formats:
1498 if rf in url_map:
1499 video_url_list = [(rf, url_map[rf])]
1500 break
1501 if video_url_list is None:
1502 self._downloader.trouble(u'ERROR: requested format not available')
1503 return
1504 else:
1505 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1506 return
1507
1508 for format_param, video_real_url in video_url_list:
1509 # At this point we have a new video
1510 self._downloader.increment_downloads()
1511
1512 # Extension
1513 video_extension = self._video_extensions.get(format_param, 'flv')
1514
1515 try:
1516 # Process video information
1517 self._downloader.process_info({
1518 'id': video_id.decode('utf-8'),
1519 'url': video_real_url.decode('utf-8'),
1520 'uploader': video_uploader.decode('utf-8'),
1521 'upload_date': upload_date,
1522 'title': video_title,
1523 'stitle': simple_title,
1524 'ext': video_extension.decode('utf-8'),
1525 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1526 'thumbnail': video_thumbnail.decode('utf-8'),
1527 'description': video_description,
1528 'player_url': player_url,
1529 'subtitles': video_subtitles
1530 })
1531 except UnavailableVideoError, err:
1532 self._downloader.trouble(u'\nERROR: unable to download video')
1533
1534
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used for Metacafe pages that mirror YouTube videos
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Receives the YoutubeIE to delegate yt-* videos to."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Retrieve disclaimer page first; this sets up the session that the
		# family-filter POST below operates on.
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age: disable the family filter so adult-flagged videos are reachable
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube: Metacafe ids of the form
		# "yt-XXXX" are mirrored YouTube videos, delegated to the YouTube IE
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage.  Two page layouts are
		# handled: the older one exposes mediaURL/gdaKey directly, the newer
		# one hides them inside the "flashvars" mediaData blob.
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available (access token appended to the media URL)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# JSON-escaped slashes need unescaping before use
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1675
1676
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information;
		# the cookie disables Dailymotion's family filter
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage: the media URL lives
		# inside the URL-encoded "sequence" flash variable
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1763
1764
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract metadata for a Google Video URL and hand it to the downloader.

		On any failure this reports via self._downloader.trouble() and
		returns None; on success it calls self._downloader.process_info().
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Assume the direct mp4 download link; downgraded to 'flv' below
		# if only the escaped stream URL is present.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the direct download link; fall back to the JS-escaped
		# flash stream URL embedded in the page.
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escaping for '=' and '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		# NOTE(review): the description is extracted (extraction failure is
		# fatal) but never passed to process_info() below — confirm intent.
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail is only available on the search results page,
			# so perform a site-restricted search for this docid.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': u'NA',
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1867
1868
1869 class PhotobucketIE(InfoExtractor):
1870 """Information extractor for photobucket.com."""
1871
1872 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1873 IE_NAME = u'photobucket'
1874
1875 def __init__(self, downloader=None):
1876 InfoExtractor.__init__(self, downloader)
1877
1878 def report_download_webpage(self, video_id):
1879 """Report webpage download."""
1880 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1881
1882 def report_extraction(self, video_id):
1883 """Report information extraction."""
1884 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1885
1886 def _real_extract(self, url):
1887 # Extract id from URL
1888 mobj = re.match(self._VALID_URL, url)
1889 if mobj is None:
1890 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1891 return
1892
1893 # At this point we have a new video
1894 self._downloader.increment_downloads()
1895 video_id = mobj.group(1)
1896
1897 video_extension = 'flv'
1898
1899 # Retrieve video webpage to extract further information
1900 request = urllib2.Request(url)
1901 try:
1902 self.report_download_webpage(video_id)
1903 webpage = urllib2.urlopen(request).read()
1904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1906 return
1907
1908 # Extract URL, uploader, and title from webpage
1909 self.report_extraction(video_id)
1910 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1911 if mobj is None:
1912 self._downloader.trouble(u'ERROR: unable to extract media URL')
1913 return
1914 mediaURL = urllib.unquote(mobj.group(1))
1915
1916 video_url = mediaURL
1917
1918 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1919 if mobj is None:
1920 self._downloader.trouble(u'ERROR: unable to extract title')
1921 return
1922 video_title = mobj.group(1).decode('utf-8')
1923 video_title = sanitize_title(video_title)
1924 simple_title = _simplify_title(vide_title)
1925
1926 video_uploader = mobj.group(2).decode('utf-8')
1927
1928 try:
1929 # Process video information
1930 self._downloader.process_info({
1931 'id': video_id.decode('utf-8'),
1932 'url': video_url.decode('utf-8'),
1933 'uploader': video_uploader,
1934 'upload_date': u'NA',
1935 'title': video_title,
1936 'stitle': simple_title,
1937 'ext': video_extension.decode('utf-8'),
1938 'format': u'NA',
1939 'player_url': None,
1940 })
1941 except UnavailableVideoError:
1942 self._downloader.trouble(u'\nERROR: unable to download video')
1943
1944
1945 class YahooIE(InfoExtractor):
1946 """Information extractor for video.yahoo.com."""
1947
1948 # _VALID_URL matches all Yahoo! Video URLs
1949 # _VPAGE_URL matches only the extractable '/watch/' URLs
1950 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1951 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1952 IE_NAME = u'video.yahoo'
1953
1954 def __init__(self, downloader=None):
1955 InfoExtractor.__init__(self, downloader)
1956
1957 def report_download_webpage(self, video_id):
1958 """Report webpage download."""
1959 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1960
1961 def report_extraction(self, video_id):
1962 """Report information extraction."""
1963 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1964
1965 def _real_extract(self, url, new_video=True):
1966 # Extract ID from URL
1967 mobj = re.match(self._VALID_URL, url)
1968 if mobj is None:
1969 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1970 return
1971
1972 # At this point we have a new video
1973 self._downloader.increment_downloads()
1974 video_id = mobj.group(2)
1975 video_extension = 'flv'
1976
1977 # Rewrite valid but non-extractable URLs as
1978 # extractable English language /watch/ URLs
1979 if re.match(self._VPAGE_URL, url) is None:
1980 request = urllib2.Request(url)
1981 try:
1982 webpage = urllib2.urlopen(request).read()
1983 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1984 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1985 return
1986
1987 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1988 if mobj is None:
1989 self._downloader.trouble(u'ERROR: Unable to extract id field')
1990 return
1991 yahoo_id = mobj.group(1)
1992
1993 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1994 if mobj is None:
1995 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1996 return
1997 yahoo_vid = mobj.group(1)
1998
1999 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2000 return self._real_extract(url, new_video=False)
2001
2002 # Retrieve video webpage to extract further information
2003 request = urllib2.Request(url)
2004 try:
2005 self.report_download_webpage(video_id)
2006 webpage = urllib2.urlopen(request).read()
2007 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2009 return
2010
2011 # Extract uploader and title from webpage
2012 self.report_extraction(video_id)
2013 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2014 if mobj is None:
2015 self._downloader.trouble(u'ERROR: unable to extract video title')
2016 return
2017 video_title = mobj.group(1).decode('utf-8')
2018 simple_title = _simplify_title(video_title)
2019
2020 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2021 if mobj is None:
2022 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2023 return
2024 video_uploader = mobj.group(1).decode('utf-8')
2025
2026 # Extract video thumbnail
2027 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2028 if mobj is None:
2029 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2030 return
2031 video_thumbnail = mobj.group(1).decode('utf-8')
2032
2033 # Extract video description
2034 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2035 if mobj is None:
2036 self._downloader.trouble(u'ERROR: unable to extract video description')
2037 return
2038 video_description = mobj.group(1).decode('utf-8')
2039 if not video_description:
2040 video_description = 'No description available.'
2041
2042 # Extract video height and width
2043 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2044 if mobj is None:
2045 self._downloader.trouble(u'ERROR: unable to extract video height')
2046 return
2047 yv_video_height = mobj.group(1)
2048
2049 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2050 if mobj is None:
2051 self._downloader.trouble(u'ERROR: unable to extract video width')
2052 return
2053 yv_video_width = mobj.group(1)
2054
2055 # Retrieve video playlist to extract media URL
2056 # I'm not completely sure what all these options are, but we
2057 # seem to need most of them, otherwise the server sends a 401.
2058 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2059 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2060 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2061 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2062 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2063 try:
2064 self.report_download_webpage(video_id)
2065 webpage = urllib2.urlopen(request).read()
2066 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2068 return
2069
2070 # Extract media URL from playlist XML
2071 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2072 if mobj is None:
2073 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2074 return
2075 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2076 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2077
2078 try:
2079 # Process video information
2080 self._downloader.process_info({
2081 'id': video_id.decode('utf-8'),
2082 'url': video_url,
2083 'uploader': video_uploader,
2084 'upload_date': u'NA',
2085 'title': video_title,
2086 'stitle': simple_title,
2087 'ext': video_extension.decode('utf-8'),
2088 'thumbnail': video_thumbnail.decode('utf-8'),
2089 'description': video_description,
2090 'thumbnail': video_thumbnail,
2091 'player_url': None,
2092 })
2093 except UnavailableVideoError:
2094 self._downloader.trouble(u'\nERROR: unable to download video')
2095
2096
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract metadata for a Vimeo URL and hand it to the downloader.

		Parses the page-embedded config JSON for title/uploader/codec
		information, then builds the play_redirect media URL. Failures
		are reported via self._downloader.trouble() and return None.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON
		# The JSON object is carved out of the page script text by string
		# splitting rather than HTML/JS parsing; json.loads then validates it.
		config = webpage.split(' = {config:')[1].split(',assets:')[0]
		try:
			config = json.loads(config)
		except:
			# NOTE(review): bare except also hides IndexError from the
			# split above and keyboard interrupts — consider narrowing.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]
		simple_title = _simplify_title(video_title)

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description
		# If the optional lxml import at the top of the file failed, the
		# name 'lxml' is unbound and this probe raises NameError; fall back
		# to a regex over the raw page in that case.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
			if mobj is not None:
				video_description = mobj.group(1)
		else:
			html_parser = lxml.etree.HTMLParser()
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
			# TODO use another parser

		# Extract upload date
		video_upload_date = u'NA'
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information
		# TODO bind to format param
		# First codec in preference order that appears in the config wins;
		# the for/else reports failure only when none matched.
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		for codec in codecs:
			if codec[0] in config["video"]["files"]:
				video_codec = codec[0]
				video_extension = codec[1]
				if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
				else: quality = 'sd'
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
			%(video_id, sig, timestamp, quality, video_codec.upper())

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id,
				'url': video_url,
				'uploader': video_uploader,
				'upload_date': video_upload_date,
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension,
				'thumbnail': video_thumbnail,
				'description': video_description,
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
2215
2216
2217 class GenericIE(InfoExtractor):
2218 """Generic last-resort information extractor."""
2219
2220 _VALID_URL = r'.*'
2221 IE_NAME = u'generic'
2222
2223 def __init__(self, downloader=None):
2224 InfoExtractor.__init__(self, downloader)
2225
2226 def report_download_webpage(self, video_id):
2227 """Report webpage download."""
2228 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2229 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2230
2231 def report_extraction(self, video_id):
2232 """Report information extraction."""
2233 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2234
2235 def _real_extract(self, url):
2236 # At this point we have a new video
2237 self._downloader.increment_downloads()
2238
2239 video_id = url.split('/')[-1]
2240 request = urllib2.Request(url)
2241 try:
2242 self.report_download_webpage(video_id)
2243 webpage = urllib2.urlopen(request).read()
2244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2246 return
2247 except ValueError, err:
2248 # since this is the last-resort InfoExtractor, if
2249 # this error is thrown, it'll be thrown here
2250 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2251 return
2252
2253 self.report_extraction(video_id)
2254 # Start with something easy: JW Player in SWFObject
2255 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2256 if mobj is None:
2257 # Broaden the search a little bit
2258 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2259 if mobj is None:
2260 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2261 return
2262
2263 # It's possible that one of the regexes
2264 # matched, but returned an empty group:
2265 if mobj.group(1) is None:
2266 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2267 return
2268
2269 video_url = urllib.unquote(mobj.group(1))
2270 video_id = os.path.basename(video_url)
2271
2272 # here's a fun little line of code for you:
2273 video_extension = os.path.splitext(video_id)[1][1:]
2274 video_id = os.path.splitext(video_id)[0]
2275
2276 # it's tempting to parse this further, but you would
2277 # have to take into account all the variations like
2278 # Video Title - Site Name
2279 # Site Name | Video Title
2280 # Video Title - Tagline | Site Name
2281 # and so on and so forth; it's just not practical
2282 mobj = re.search(r'<title>(.*)</title>', webpage)
2283 if mobj is None:
2284 self._downloader.trouble(u'ERROR: unable to extract title')
2285 return
2286 video_title = mobj.group(1).decode('utf-8')
2287 video_title = sanitize_title(video_title)
2288 simple_title = _simplify_title(video_title)
2289
2290 # video uploader is domain name
2291 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2292 if mobj is None:
2293 self._downloader.trouble(u'ERROR: unable to extract title')
2294 return
2295 video_uploader = mobj.group(1).decode('utf-8')
2296
2297 try:
2298 # Process video information
2299 self._downloader.process_info({
2300 'id': video_id.decode('utf-8'),
2301 'url': video_url.decode('utf-8'),
2302 'uploader': video_uploader,
2303 'upload_date': u'NA',
2304 'title': video_title,
2305 'stitle': simple_title,
2306 'ext': video_extension.decode('utf-8'),
2307 'format': u'NA',
2308 'player_url': None,
2309 })
2310 except UnavailableVideoError, err:
2311 self._downloader.trouble(u'\nERROR: unable to download video')
2312
2313
2314 class YoutubeSearchIE(InfoExtractor):
2315 """Information Extractor for YouTube search queries."""
2316 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2317 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2318 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2319 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2320 _youtube_ie = None
2321 _max_youtube_results = 1000
2322 IE_NAME = u'youtube:search'
2323
2324 def __init__(self, youtube_ie, downloader=None):
2325 InfoExtractor.__init__(self, downloader)
2326 self._youtube_ie = youtube_ie
2327
2328 def report_download_page(self, query, pagenum):
2329 """Report attempt to download playlist page with given number."""
2330 query = query.decode(preferredencoding())
2331 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2332
2333 def _real_initialize(self):
2334 self._youtube_ie.initialize()
2335
2336 def _real_extract(self, query):
2337 mobj = re.match(self._VALID_URL, query)
2338 if mobj is None:
2339 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2340 return
2341
2342 prefix, query = query.split(':')
2343 prefix = prefix[8:]
2344 query = query.encode('utf-8')
2345 if prefix == '':
2346 self._download_n_results(query, 1)
2347 return
2348 elif prefix == 'all':
2349 self._download_n_results(query, self._max_youtube_results)
2350 return
2351 else:
2352 try:
2353 n = long(prefix)
2354 if n <= 0:
2355 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2356 return
2357 elif n > self._max_youtube_results:
2358 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2359 n = self._max_youtube_results
2360 self._download_n_results(query, n)
2361 return
2362 except ValueError: # parsing prefix as integer fails
2363 self._download_n_results(query, 1)
2364 return
2365
2366 def _download_n_results(self, query, n):
2367 """Downloads a specified number of results for a query"""
2368
2369 video_ids = []
2370 already_seen = set()
2371 pagenum = 1
2372
2373 while True:
2374 self.report_download_page(query, pagenum)
2375 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2376 request = urllib2.Request(result_url)
2377 try:
2378 page = urllib2.urlopen(request).read()
2379 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2380 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2381 return
2382
2383 # Extract video identifiers
2384 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2385 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2386 if video_id not in already_seen:
2387 video_ids.append(video_id)
2388 already_seen.add(video_id)
2389 if len(video_ids) == n:
2390 # Specified n videos reached
2391 for id in video_ids:
2392 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2393 return
2394
2395 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2396 for id in video_ids:
2397 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2398 return
2399
2400 pagenum = pagenum + 1
2401
2402
2403 class GoogleSearchIE(InfoExtractor):
2404 """Information Extractor for Google Video search queries."""
2405 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2406 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2407 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2408 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2409 _google_ie = None
2410 _max_google_results = 1000
2411 IE_NAME = u'video.google:search'
2412
2413 def __init__(self, google_ie, downloader=None):
2414 InfoExtractor.__init__(self, downloader)
2415 self._google_ie = google_ie
2416
2417 def report_download_page(self, query, pagenum):
2418 """Report attempt to download playlist page with given number."""
2419 query = query.decode(preferredencoding())
2420 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2421
2422 def _real_initialize(self):
2423 self._google_ie.initialize()
2424
2425 def _real_extract(self, query):
2426 mobj = re.match(self._VALID_URL, query)
2427 if mobj is None:
2428 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2429 return
2430
2431 prefix, query = query.split(':')
2432 prefix = prefix[8:]
2433 query = query.encode('utf-8')
2434 if prefix == '':
2435 self._download_n_results(query, 1)
2436 return
2437 elif prefix == 'all':
2438 self._download_n_results(query, self._max_google_results)
2439 return
2440 else:
2441 try:
2442 n = long(prefix)
2443 if n <= 0:
2444 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2445 return
2446 elif n > self._max_google_results:
2447 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2448 n = self._max_google_results
2449 self._download_n_results(query, n)
2450 return
2451 except ValueError: # parsing prefix as integer fails
2452 self._download_n_results(query, 1)
2453 return
2454
2455 def _download_n_results(self, query, n):
2456 """Downloads a specified number of results for a query"""
2457
2458 video_ids = []
2459 pagenum = 0
2460
2461 while True:
2462 self.report_download_page(query, pagenum)
2463 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2464 request = urllib2.Request(result_url)
2465 try:
2466 page = urllib2.urlopen(request).read()
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2469 return
2470
2471 # Extract video identifiers
2472 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2473 video_id = mobj.group(1)
2474 if video_id not in video_ids:
2475 video_ids.append(video_id)
2476 if len(video_ids) == n:
2477 # Specified n videos reached
2478 for id in video_ids:
2479 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2480 return
2481
2482 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483 for id in video_ids:
2484 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2485 return
2486
2487 pagenum = pagenum + 1
2488
2489
2490 class YahooSearchIE(InfoExtractor):
2491 """Information Extractor for Yahoo! Video search queries."""
2492 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2493 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2494 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2495 _MORE_PAGES_INDICATOR = r'\s*Next'
2496 _yahoo_ie = None
2497 _max_yahoo_results = 1000
2498 IE_NAME = u'video.yahoo:search'
2499
2500 def __init__(self, yahoo_ie, downloader=None):
2501 InfoExtractor.__init__(self, downloader)
2502 self._yahoo_ie = yahoo_ie
2503
2504 def report_download_page(self, query, pagenum):
2505 """Report attempt to download playlist page with given number."""
2506 query = query.decode(preferredencoding())
2507 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2508
2509 def _real_initialize(self):
2510 self._yahoo_ie.initialize()
2511
2512 def _real_extract(self, query):
2513 mobj = re.match(self._VALID_URL, query)
2514 if mobj is None:
2515 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2516 return
2517
2518 prefix, query = query.split(':')
2519 prefix = prefix[8:]
2520 query = query.encode('utf-8')
2521 if prefix == '':
2522 self._download_n_results(query, 1)
2523 return
2524 elif prefix == 'all':
2525 self._download_n_results(query, self._max_yahoo_results)
2526 return
2527 else:
2528 try:
2529 n = long(prefix)
2530 if n <= 0:
2531 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2532 return
2533 elif n > self._max_yahoo_results:
2534 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2535 n = self._max_yahoo_results
2536 self._download_n_results(query, n)
2537 return
2538 except ValueError: # parsing prefix as integer fails
2539 self._download_n_results(query, 1)
2540 return
2541
2542 def _download_n_results(self, query, n):
2543 """Downloads a specified number of results for a query"""
2544
2545 video_ids = []
2546 already_seen = set()
2547 pagenum = 1
2548
2549 while True:
2550 self.report_download_page(query, pagenum)
2551 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2552 request = urllib2.Request(result_url)
2553 try:
2554 page = urllib2.urlopen(request).read()
2555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2556 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2557 return
2558
2559 # Extract video identifiers
2560 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2561 video_id = mobj.group(1)
2562 if video_id not in already_seen:
2563 video_ids.append(video_id)
2564 already_seen.add(video_id)
2565 if len(video_ids) == n:
2566 # Specified n videos reached
2567 for id in video_ids:
2568 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2569 return
2570
2571 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2572 for id in video_ids:
2573 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2574 return
2575
2576 pagenum = pagenum + 1
2577
2578
2579 class YoutubePlaylistIE(InfoExtractor):
2580 """Information Extractor for YouTube playlists."""
2581
2582 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2583 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2584 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2585 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2586 _youtube_ie = None
2587 IE_NAME = u'youtube:playlist'
2588
2589 def __init__(self, youtube_ie, downloader=None):
2590 InfoExtractor.__init__(self, downloader)
2591 self._youtube_ie = youtube_ie
2592
2593 def report_download_page(self, playlist_id, pagenum):
2594 """Report attempt to download playlist page with given number."""
2595 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2596
2597 def _real_initialize(self):
2598 self._youtube_ie.initialize()
2599
2600 def _real_extract(self, url):
2601 # Extract playlist id
2602 mobj = re.match(self._VALID_URL, url)
2603 if mobj is None:
2604 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2605 return
2606
2607 # Single video case
2608 if mobj.group(3) is not None:
2609 self._youtube_ie.extract(mobj.group(3))
2610 return
2611
2612 # Download playlist pages
2613 # prefix is 'p' as default for playlists but there are other types that need extra care
2614 playlist_prefix = mobj.group(1)
2615 if playlist_prefix == 'a':
2616 playlist_access = 'artist'
2617 else:
2618 playlist_prefix = 'p'
2619 playlist_access = 'view_play_list'
2620 playlist_id = mobj.group(2)
2621 video_ids = []
2622 pagenum = 1
2623
2624 while True:
2625 self.report_download_page(playlist_id, pagenum)
2626 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2627 request = urllib2.Request(url)
2628 try:
2629 page = urllib2.urlopen(request).read()
2630 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2631 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2632 return
2633
2634 # Extract video identifiers
2635 ids_in_page = []
2636 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2637 if mobj.group(1) not in ids_in_page:
2638 ids_in_page.append(mobj.group(1))
2639 video_ids.extend(ids_in_page)
2640
2641 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2642 break
2643 pagenum = pagenum + 1
2644
2645 playliststart = self._downloader.params.get('playliststart', 1) - 1
2646 playlistend = self._downloader.params.get('playlistend', -1)
2647 video_ids = video_ids[playliststart:playlistend]
2648
2649 for id in video_ids:
2650 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2651 return
2652
2653
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    _youtube_ie = None
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Gather a user's uploads via the GData API and extract each video."""
        # Pull the username out of the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # keep requesting successive pages until one comes back short.
        video_ids = []
        page_idx = 0

        while True:
            first_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first_index))
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Collect the video ids on this page, dropping duplicates.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A short page means we reached the final one — no further query
            # is needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end; -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2740
2741
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Fetch the file page with the 'Free download' form submitted and
        scrape the direct download URL and file title from it."""
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # POSTing gateway_result=1 simulates pressing the 'Free download' button.
        post_data = urllib.urlencode({'gateway_result': '1'})
        request = urllib2.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None and mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        try:
            # Process file information
            self._downloader.process_info({
                'id': file_id.decode('utf-8'),
                'url': file_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': file_title,
                'stitle': file_title,
                'ext': file_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError as err:
            self._downloader.trouble(u'ERROR: unable to download file')
2814
2815
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Known format identifiers, ordered best quality first (used for the
    # --format-limit and best/worst selection logic in _real_extract).
    _available_formats = ['video', 'highqual', 'lowqual']
    # Filename extension for each format id.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with any of 'title', 'description', 'owner',
        'thumbnail' that matched, plus 'video_urls' mapping each available
        format id to its media URL (possibly empty).
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are embedded as escaped-Unicode JavaScript strings.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in using --username/--password or ~/.netrc credentials.

        Logging in is optional; without credentials this is a no-op.
        Login failures are reported as warnings, not fatal errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse it with _parse_page, pick formats
        according to --format/--format-limit, and process each one."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        # thumbnail image: missing thumbnail is only a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date: parsed with email.utils.parsedate_tz when present,
        # reformatted to YYYYMMDD; any parse failure falls back to 'NA'.
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # _available_formats is ordered best-first, so slicing from the
            # limit keeps only formats at or below the requested quality.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

            for format_param, video_real_url in video_url_list:

                # At this point we have a new video
                self._downloader.increment_downloads()

                # Extension
                video_extension = self._video_extensions.get(format_param, 'mp4')

                try:
                    # Process video information
                    self._downloader.process_info({
                        'id': video_id.decode('utf-8'),
                        'url': video_real_url.decode('utf-8'),
                        'uploader': video_uploader.decode('utf-8'),
                        'upload_date': upload_date,
                        'title': video_title,
                        'stitle': simple_title,
                        'ext': video_extension.decode('utf-8'),
                        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                        'thumbnail': video_thumbnail.decode('utf-8'),
                        'description': video_description.decode('utf-8'),
                        'player_url': None,
                    })
                except UnavailableVideoError, err:
                    self._downloader.trouble(u'\nERROR: unable to download video')
3031
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch video metadata via blip.tv's JSON API (skin=json).

        If the request turns out to return the media itself (Content-Type
        video/*), treat it as a direct download and pass the open URL handle
        through; otherwise parse the JSON response for the real media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-API parameters with the appropriate separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        # info stays None unless the direct-download branch fills it in.
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    # Pass the already-open handle so the body is not fetched twice.
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                # NOTE(review): `json` is not imported in the visible file
                # header; presumably provided elsewhere in the file — verify.
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Reformat the API's datestamp into YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3124
3125
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Download the watch page and derive the flv URL from the embedded
        thumbnail link.

        BUGFIX: the invalid-URL branch called self._download.trouble (typo),
        which raised AttributeError instead of reporting the error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)
        # The media URL is reconstructed from the thumbnail's base path.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        try:
            self._downloader.process_info({
                'id': video_id,
                'url': video_url,
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': u'flv',
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: Unable to download video')
3192
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a ':shortname' alias (tds, colbert, ...) or a
    # full-episodes URL on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve a shortname or episode URL to its media items (via the
        mrss index feed) and download each rendition's media URL from the
        mediaGen configuration XML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # A bare shortname points at the show's full-episodes front page,
        # which redirects to the newest episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # The front page redirected to a concrete episode; re-parse the
            # final URL to recover the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The Flash player URL embeds the mtvnservices feed uri for the episode.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per media segment; process each independently.
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, url) pairs for every available rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': _simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3327
3328
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL from an Escapist episode page via the
        player's JSON(ish) configuration file.

        BUGFIX: the og:/description meta-tag and config-URL regex matches
        were dereferenced without None checks, crashing with AttributeError
        on unexpected page layouts; they now fail with a proper error.
        """
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
            return

        # Page metadata lives in <meta> tags; any of them may be missing.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if descMatch is None or imgMatch is None or playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract page metadata')
            return
        description = htmlParser.unescape(descMatch.group(1))
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))

        # The player URL carries the configuration URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'stitle': _simplify_title(showName),
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError as err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3405
3406
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the public video id to the internal one, then read the
        moogaloop metadata XML that points at the media file.

        CLEANUP: dropped an unused local HTMLParser instance.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # The page embeds a separate internal id used by the metadata service.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
            return
        internal_video_id = m.group('internalvideoid')

        info = {
            'id': video_id,
            'internal_id': internal_video_id,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        try:
            metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['stitle'] = _simplify_title(info['title'])
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        except IndexError:
            # Any missing element means the XML is not what we expect.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError as err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3477
3478
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the watch page.

        CLEANUP: dropped an unused local HTMLParser instance.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flv_url flashvar)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
            'player_url': None,
        }

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError as err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3558
3559
3560 class SoundcloudIE(InfoExtractor):
3561 """Information extractor for soundcloud.com
3562 To access the media, the uid of the song and a stream token
3563 must be extracted from the page source and the script must make
3564 a request to media.soundcloud.com/crossdomain.xml. Then
3565 the media can be grabbed by requesting from an url composed
3566 of the stream token and uid
3567 """
3568
3569 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3570 IE_NAME = u'soundcloud'
3571
3572 def __init__(self, downloader=None):
3573 InfoExtractor.__init__(self, downloader)
3574
3575 def report_webpage(self, video_id):
3576 """Report information extraction."""
3577 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3578
3579 def report_extraction(self, video_id):
3580 """Report information extraction."""
3581 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3582
3583 def _real_extract(self, url):
3584 htmlParser = HTMLParser.HTMLParser()
3585
3586 mobj = re.match(self._VALID_URL, url)
3587 if mobj is None:
3588 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3589 return
3590
3591 # extract uploader (which is in the url)
3592 uploader = mobj.group(1).decode('utf-8')
3593 # extract simple title (uploader + slug of song title)
3594 slug_title = mobj.group(2).decode('utf-8')
3595 simple_title = uploader + '-' + slug_title
3596
3597 self.report_webpage('%s/%s' % (uploader, slug_title))
3598
3599 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3600 try:
3601 webpage = urllib2.urlopen(request).read()
3602 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3603 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3604 return
3605
3606 self.report_extraction('%s/%s' % (uploader, slug_title))
3607
3608 # extract uid and stream token that soundcloud hands out for access
3609 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3610 if mobj:
3611 video_id = mobj.group(1)
3612 stream_token = mobj.group(2)
3613
3614 # extract unsimplified title
3615 mobj = re.search('"title":"(.*?)",', webpage)
3616 if mobj:
3617 title = mobj.group(1)
3618
3619 # construct media url (with uid/token)
3620 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3621 mediaURL = mediaURL % (video_id, stream_token)
3622
3623 # description
3624 description = u'No description available'
3625 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3626 if mobj:
3627 description = mobj.group(1)
3628
3629 # upload date
3630 upload_date = None
3631 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3632 if mobj:
3633 try:
3634 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3635 except Exception, e:
3636 print str(e)
3637
3638 # for soundcloud, a request to a cross domain is required for cookies
3639 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3640
3641 try:
3642 self._downloader.process_info({
3643 'id': video_id.decode('utf-8'),
3644 'url': mediaURL,
3645 'uploader': uploader.decode('utf-8'),
3646 'upload_date': upload_date,
3647 'title': simple_title.decode('utf-8'),
3648 'stitle': simple_title.decode('utf-8'),
3649 'ext': u'mp3',
3650 'format': u'NA',
3651 'player_url': None,
3652 'description': description.decode('utf-8')
3653 })
3654 except UnavailableVideoError:
3655 self._downloader.trouble(u'\nERROR: unable to download video')
3656
3657
3658 class InfoQIE(InfoExtractor):
3659 """Information extractor for infoq.com"""
3660
3661 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3662 IE_NAME = u'infoq'
3663
3664 def report_webpage(self, video_id):
3665 """Report information extraction."""
3666 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3667
3668 def report_extraction(self, video_id):
3669 """Report information extraction."""
3670 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3671
3672 def _real_extract(self, url):
3673 htmlParser = HTMLParser.HTMLParser()
3674
3675 mobj = re.match(self._VALID_URL, url)
3676 if mobj is None:
3677 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3678 return
3679
3680 self.report_webpage(url)
3681
3682 request = urllib2.Request(url)
3683 try:
3684 webpage = urllib2.urlopen(request).read()
3685 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3686 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3687 return
3688
3689 self.report_extraction(url)
3690
3691
3692 # Extract video URL
3693 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3694 if mobj is None:
3695 self._downloader.trouble(u'ERROR: unable to extract video url')
3696 return
3697 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3698
3699
3700 # Extract title
3701 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3702 if mobj is None:
3703 self._downloader.trouble(u'ERROR: unable to extract video title')
3704 return
3705 video_title = mobj.group(1).decode('utf-8')
3706
3707 # Extract description
3708 video_description = u'No description available.'
3709 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3710 if mobj is not None:
3711 video_description = mobj.group(1).decode('utf-8')
3712
3713 video_filename = video_url.split('/')[-1]
3714 video_id, extension = video_filename.split('.')
3715
3716 self._downloader.increment_downloads()
3717 info = {
3718 'id': video_id,
3719 'url': video_url,
3720 'uploader': None,
3721 'upload_date': None,
3722 'title': video_title,
3723 'stitle': _simplify_title(video_title),
3724 'ext': extension,
3725 'format': extension, # Extension is always(?) mp4, but seems to be flv
3726 'thumbnail': None,
3727 'description': video_description,
3728 'player_url': None,
3729 }
3730
3731 try:
3732 self._downloader.process_info(info)
3733 except UnavailableVideoError, err:
3734 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3735
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData maps format name -> either a {bitrate: [urls]} dict or a
        plain url list (no bitrate info).  'best' (or an unknown bitrate)
        selects the highest available bitrate.
        """
        file_url = None  # NOTE(review): unused; left in place
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            # jsonData[fmt] is a plain list, not a dict keyed by bitrate
            url_list = jsonData[fmt]

        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a live request; first one that opens
        # without error wins.
        for url in url_list:
            try:
                urllib2.urlopen(url)
                return url
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                url = None

        return None

    def _print_formats(self, formats):
        # Dump a "format<TAB>bitrate<TAB>[ext]" table for --list-formats.
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    # formats[fmt] is a flat url list; print once and stop
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
            return

        # parse JSON
        # NOTE(review): the json module is expected to be imported elsewhere
        # in this file (not visible in this chunk) — verify
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each advertised format until one yields a live url.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # We have audio
        self._downloader.increment_downloads()
        try:
            # Process file information
            self._downloader.process_info({
                'id': file_id.decode('utf-8'),
                'url': file_url.decode('utf-8'),
                'uploader': uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': json_data['name'],
                'stitle': _simplify_title(json_data['name']),
                'ext': file_url.split('.')[-1].decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': json_data['thumbnail_url'],
                'description': json_data['description'],
                'player_url': player_url.decode('utf-8'),
            })
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
3856
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Three cases, decided by which named groups of _VALID_URL matched:
        #   course+video -> one lecture (download it);
        #   course only  -> a course page (playlist of lectures);
        #   neither      -> the site root (playlist of courses).
        # Playlist entries are re-dispatched recursively via self.extract().
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': _simplify_title(course + '_' + video),
            }

            self.report_extraction(info['id'])
            # Per-lecture metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            info = {
                'id': _simplify_title(course),
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            # Course title from the page's <h1>, falling back to the id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a playlist reference entry.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
        else: # Root page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Each CoursePage link becomes a playlist reference entry.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
3975
3976 class MTVIE(InfoExtractor):
3977 """Information extractor for MTV.com"""
3978
3979 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3980 IE_NAME = u'mtv'
3981
3982 def report_webpage(self, video_id):
3983 """Report information extraction."""
3984 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3985
3986 def report_extraction(self, video_id):
3987 """Report information extraction."""
3988 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3989
3990 def _real_extract(self, url):
3991 mobj = re.match(self._VALID_URL, url)
3992 if mobj is None:
3993 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3994 return
3995 if not mobj.group('proto'):
3996 url = 'http://' + url
3997 video_id = mobj.group('videoid')
3998 self.report_webpage(video_id)
3999
4000 request = urllib2.Request(url)
4001 try:
4002 webpage = urllib2.urlopen(request).read()
4003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4004 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4005 return
4006
4007 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4008 if mobj is None:
4009 self._downloader.trouble(u'ERROR: unable to extract song name')
4010 return
4011 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4012 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4013 if mobj is None:
4014 self._downloader.trouble(u'ERROR: unable to extract performer')
4015 return
4016 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4017 video_title = performer + ' - ' + song_name
4018
4019 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4020 if mobj is None:
4021 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4022 return
4023 mtvn_uri = mobj.group(1)
4024
4025 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4026 if mobj is None:
4027 self._downloader.trouble(u'ERROR: unable to extract content id')
4028 return
4029 content_id = mobj.group(1)
4030
4031 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4032 self.report_extraction(video_id)
4033 request = urllib2.Request(videogen_url)
4034 try:
4035 metadataXml = urllib2.urlopen(request).read()
4036 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4037 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4038 return
4039
4040 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4041 renditions = mdoc.findall('.//rendition')
4042
4043 # For now, always pick the highest quality.
4044 rendition = renditions[-1]
4045
4046 try:
4047 _,_,ext = rendition.attrib['type'].partition('/')
4048 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4049 video_url = rendition.find('./src').text
4050 except KeyError:
4051 self._downloader.trouble('Invalid rendition field.')
4052 return
4053
4054 self._downloader.increment_downloads()
4055 info = {
4056 'id': video_id,
4057 'url': video_url,
4058 'uploader': performer,
4059 'title': video_title,
4060 'stitle': _simplify_title(video_title),
4061 'ext': ext,
4062 'format': format,
4063 }
4064
4065 try:
4066 self._downloader.process_info(info)
4067 except UnavailableVideoError, err:
4068 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4069
4070
class PostProcessor(object):
    """Base class for post-processing steps.

    A PostProcessor is attached to a downloader via its
    add_post_processor() method.  After each successful download the
    downloader walks its chain of post-processors: the first receives an
    initial info dictionary, and every later one receives the value
    returned by its predecessor.  Returning None stops the chain; a
    processor may also raise PostProcessingError, which the calling
    downloader handles.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        'information' is an InfoExtractor-style dictionary with one extra
        key, "filepath", naming the downloaded file.  The return value is
        either None (stop the chain) or an information dictionary (possibly
        with some fields changed) to hand to the next processor.
        """
        return information # the base class leaves the data untouched
4116
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe audio extraction fails.

    BUGFIX: derives from Exception instead of BaseException (BaseException
    is reserved for system-exiting exceptions), and chains to the parent
    constructor so str(err) carries the reason.  The human-readable reason
    remains available as err.message for existing callers.
    """
    def __init__(self, message):
        Exception.__init__(self, message)
        self.message = message
4120
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that extracts the audio track of a downloaded file
    with ffmpeg, choosing a lossless stream copy whenever the source codec
    already matches the requested one."""

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        # preferredcodec: target codec name or 'best' (keep source codec);
        # preferredquality: ffmpeg -ab bitrate value, or None;
        # keepvideo: if False the source file is deleted after conversion.
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    @staticmethod
    def get_audio_codec(path):
        # Run ffprobe and scan its stream dump for the codec_name of the
        # audio stream.  Returns None when ffprobe is missing or fails.
        try:
            cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            # codec_name appears before codec_type within a stream block, so
            # remember the last codec_name seen and return it once the block
            # turns out to be an audio stream.
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        # Invoke ffmpeg to write out_path with the given audio codec (None
        # lets ffmpeg pick) and extra options.  Raises AudioConversionError
        # with the last stderr line on failure, or a friendly message when
        # the ffmpeg binary is absent (OSError errno 2).
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            else:
                raise e
        if p.returncode != 0:
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        # Entry point of the post-processing chain: probe the codec, decide
        # between stream copy and re-encode, run ffmpeg, then optionally
        # preserve the file timestamp and delete the source video.
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                extension = 'wav'
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except:
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            else:
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            except:
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4245
4246
4247 def updateSelf(downloader, filename):
4248 ''' Update the program file with the latest version from the repository '''
4249 # Note: downloader only used for options
4250 if not os.access(filename, os.W_OK):
4251 sys.exit('ERROR: no write permissions on %s' % filename)
4252
4253 downloader.to_screen(u'Updating to latest version...')
4254
4255 try:
4256 try:
4257 urlh = urllib.urlopen(UPDATE_URL)
4258 newcontent = urlh.read()
4259
4260 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4261 if vmatch is not None and vmatch.group(1) == __version__:
4262 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4263 return
4264 finally:
4265 urlh.close()
4266 except (IOError, OSError), err:
4267 sys.exit('ERROR: unable to download latest version')
4268
4269 try:
4270 outf = open(filename, 'wb')
4271 try:
4272 outf.write(newcontent)
4273 finally:
4274 outf.close()
4275 except (IOError, OSError), err:
4276 sys.exit('ERROR: unable to overwrite current version')
4277
4278 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4279
4280 def parseOpts():
4281 def _readOptions(filename_bytes):
4282 try:
4283 optionf = open(filename_bytes)
4284 except IOError:
4285 return [] # silently skip if file is not present
4286 try:
4287 res = []
4288 for l in optionf:
4289 res += shlex.split(l, comments=True)
4290 finally:
4291 optionf.close()
4292 return res
4293
4294 def _format_option_string(option):
4295 ''' ('-o', '--option') -> -o, --format METAVAR'''
4296
4297 opts = []
4298
4299 if option._short_opts: opts.append(option._short_opts[0])
4300 if option._long_opts: opts.append(option._long_opts[0])
4301 if len(opts) > 1: opts.insert(1, ', ')
4302
4303 if option.takes_value(): opts.append(' %s' % option.metavar)
4304
4305 return "".join(opts)
4306
4307 def _find_term_columns():
4308 columns = os.environ.get('COLUMNS', None)
4309 if columns:
4310 return int(columns)
4311
4312 try:
4313 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4314 out,err = sp.communicate()
4315 return int(out.split()[1])
4316 except:
4317 pass
4318 return None
4319
4320 max_width = 80
4321 max_help_position = 80
4322
4323 # No need to wrap help messages if we're on a wide console
4324 columns = _find_term_columns()
4325 if columns: max_width = columns
4326
4327 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4328 fmt.format_option_strings = _format_option_string
4329
4330 kw = {
4331 'version' : __version__,
4332 'formatter' : fmt,
4333 'usage' : '%prog [options] url [url...]',
4334 'conflict_handler' : 'resolve',
4335 }
4336
4337 parser = optparse.OptionParser(**kw)
4338
4339 # option groups
4340 general = optparse.OptionGroup(parser, 'General Options')
4341 selection = optparse.OptionGroup(parser, 'Video Selection')
4342 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4343 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4344 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4345 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4346 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4347
4348 general.add_option('-h', '--help',
4349 action='help', help='print this help text and exit')
4350 general.add_option('-v', '--version',
4351 action='version', help='print program version and exit')
4352 general.add_option('-U', '--update',
4353 action='store_true', dest='update_self', help='update this program to latest version')
4354 general.add_option('-i', '--ignore-errors',
4355 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4356 general.add_option('-r', '--rate-limit',
4357 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4358 general.add_option('-R', '--retries',
4359 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4360 general.add_option('--dump-user-agent',
4361 action='store_true', dest='dump_user_agent',
4362 help='display the current browser identification', default=False)
4363 general.add_option('--list-extractors',
4364 action='store_true', dest='list_extractors',
4365 help='List all supported extractors and the URLs they would handle', default=False)
4366
4367 selection.add_option('--playlist-start',
4368 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4369 selection.add_option('--playlist-end',
4370 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4371 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4372 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4373 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4374
4375 authentication.add_option('-u', '--username',
4376 dest='username', metavar='USERNAME', help='account username')
4377 authentication.add_option('-p', '--password',
4378 dest='password', metavar='PASSWORD', help='account password')
4379 authentication.add_option('-n', '--netrc',
4380 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4381
4382
4383 video_format.add_option('-f', '--format',
4384 action='store', dest='format', metavar='FORMAT', help='video format code')
4385 video_format.add_option('--all-formats',
4386 action='store_const', dest='format', help='download all available video formats', const='all')
4387 video_format.add_option('--prefer-free-formats',
4388 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4389 video_format.add_option('--max-quality',
4390 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4391 video_format.add_option('-F', '--list-formats',
4392 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4393
4394
4395 verbosity.add_option('-q', '--quiet',
4396 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4397 verbosity.add_option('-s', '--simulate',
4398 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4399 verbosity.add_option('--skip-download',
4400 action='store_true', dest='skip_download', help='do not download the video', default=False)
4401 verbosity.add_option('-g', '--get-url',
4402 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4403 verbosity.add_option('-e', '--get-title',
4404 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4405 verbosity.add_option('--get-thumbnail',
4406 action='store_true', dest='getthumbnail',
4407 help='simulate, quiet but print thumbnail URL', default=False)
4408 verbosity.add_option('--get-description',
4409 action='store_true', dest='getdescription',
4410 help='simulate, quiet but print video description', default=False)
4411 verbosity.add_option('--get-filename',
4412 action='store_true', dest='getfilename',
4413 help='simulate, quiet but print output filename', default=False)
4414 verbosity.add_option('--get-format',
4415 action='store_true', dest='getformat',
4416 help='simulate, quiet but print output format', default=False)
4417 verbosity.add_option('--no-progress',
4418 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4419 verbosity.add_option('--console-title',
4420 action='store_true', dest='consoletitle',
4421 help='display progress in console titlebar', default=False)
4422 verbosity.add_option('-v', '--verbose',
4423 action='store_true', dest='verbose', help='print various debugging information', default=False)
4424
4425
4426 filesystem.add_option('-t', '--title',
4427 action='store_true', dest='usetitle', help='use title in file name', default=False)
4428 filesystem.add_option('-l', '--literal',
4429 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4430 filesystem.add_option('-A', '--auto-number',
4431 action='store_true', dest='autonumber',
4432 help='number downloaded files starting from 00000', default=False)
4433 filesystem.add_option('-o', '--output',
4434 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4435 filesystem.add_option('-a', '--batch-file',
4436 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4437 filesystem.add_option('-w', '--no-overwrites',
4438 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4439 filesystem.add_option('-c', '--continue',
4440 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4441 filesystem.add_option('--no-continue',
4442 action='store_false', dest='continue_dl',
4443 help='do not resume partially downloaded files (restart from beginning)')
4444 filesystem.add_option('--cookies',
4445 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4446 filesystem.add_option('--no-part',
4447 action='store_true', dest='nopart', help='do not use .part files', default=False)
4448 filesystem.add_option('--no-mtime',
4449 action='store_false', dest='updatetime',
4450 help='do not use the Last-modified header to set the file modification time', default=True)
4451 filesystem.add_option('--write-description',
4452 action='store_true', dest='writedescription',
4453 help='write video description to a .description file', default=False)
4454 filesystem.add_option('--write-info-json',
4455 action='store_true', dest='writeinfojson',
4456 help='write video metadata to a .info.json file', default=False)
4457 filesystem.add_option('--write-srt',
4458 action='store_true', dest='writesubtitles',
4459 help='write video subtitles to a .srt file', default=False)
4460
4461
4462 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4463 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4464 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4465 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4466 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4467 help='ffmpeg audio bitrate specification, 128k by default')
4468 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4469 help='keeps the video file on disk after the post-processing; the video is erased by default')
4470
4471
4472 parser.add_option_group(general)
4473 parser.add_option_group(selection)
4474 parser.add_option_group(filesystem)
4475 parser.add_option_group(verbosity)
4476 parser.add_option_group(video_format)
4477 parser.add_option_group(authentication)
4478 parser.add_option_group(postproc)
4479
4480 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4481 if xdg_config_home:
4482 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4483 else:
4484 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4485 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4486 opts, args = parser.parse_args(argv)
4487
4488 return parser, opts, args
4489
def gen_extractors():
	"""Build and return one instance of every supported info extractor.

	Order is significant: a URL is handled by the first extractor whose
	suitable() check accepts it, so the generic fallback is appended last.
	"""
	# These three are shared with their playlist/user/search wrappers.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	ies = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# Catch-all extractor must remain the very last entry.
	ies.append(GenericIE())
	return ies
4526
def _real_main():
	"""Parse options, validate them, construct the FileDownloader and run
	it over every requested URL.

	Terminates the process via sys.exit() on every path (including the
	success path at the bottom); it never returns normally.
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		# No --cookies given: use an in-memory jar that is never persisted.
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load when the file already exists and is readable; a
			# missing file is fine — it will be created on jar.save() below.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				# '-' means read the URL list from stdin.
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop empty lines and comment lines starting with '#', '/' or ';'.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	# Batch-file URLs are processed before positional command-line URLs.
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	# YoutubeDLHandler is the project's custom HTTP handler; install the
	# opener globally so all urllib2 calls in the extractors use it.
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# Print every extractor name; under each, list which of the given
		# URLs it would handle. Each URL is claimed by at most one
		# extractor (the first suitable one), mirroring download dispatch.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u' ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Username without password: prompt interactively (never echoed).
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		# parse_bytes understands suffixed values like '50k'; None means unparsable.
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		# Options arrive as strings from optparse; normalize to int here.
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		# -1 is the sentinel for 'until the end of the playlist'.
		opts.playlistend = int(opts.playlistend)
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	# NOTE(review): opts.max_downloads is passed through as the raw option
	# string — presumably FileDownloader handles the conversion; verify.
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any of the --get-* flags implies quiet mode and skipping the download.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# First truthy template wins: explicit -o, then format/title/number
		# combinations, finally the bare '%(id)s.%(ext)s' fallback.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means status output must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		# sys.argv[0] is the path of this very script, which gets overwritten.
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			# --update-self alone is a valid invocation with no URLs.
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4699
def main():
	"""Script entry point: run _real_main and map the known fatal
	exceptions onto process exit codes / messages."""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
4709
# Invoke main() only when run as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: