]> jfr.im git - yt-dlp.git/blame - youtube_dl/__init__.py
Simplify simplify_title
[yt-dlp.git] / youtube_dl / __init__.py
CommitLineData
235b3ba4
PH
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
# Contributors, in rough chronological order of first contribution.
__author__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    )

__license__ = 'Public Domain'
__version__ = '2011.10.19'

# Canonical location of the latest release; used by the self-update mechanism.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
23
24import cookielib
25import datetime
26import gzip
27import htmlentitydefs
28import HTMLParser
29import httplib
30import locale
31import math
32import netrc
33import os
34import os.path
35import re
36import socket
37import string
38import subprocess
39import sys
40import time
41import urllib
42import urllib2
43import warnings
44import zlib
45
46if os.name == 'nt':
47 import ctypes
48
49try:
50 import email.utils
51except ImportError: # Python 2.4
52 import email.Utils
53try:
54 import cStringIO as StringIO
55except ImportError:
56 import StringIO
57
58# parse_qs was moved from the cgi module to the urlparse module recently.
59try:
60 from urlparse import parse_qs
61except ImportError:
62 from cgi import parse_qs
63
64try:
65 import lxml.etree
66except ImportError:
67 pass # Handled below
68
69try:
70 import xml.etree.ElementTree
71except ImportError: # Python<2.5: Not officially supported, but let it slip
72 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73
# Default HTTP headers attached to every outgoing request (see
# YoutubeDLHandler.http_request). A browser-like User-Agent avoids sites
# serving degraded or blocked pages to scripts.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
81
235b3ba4
PH
# json entered the stdlib in Python 2.6; on older interpreters fall back to
# the bundled "trivialjson" parser, which implements loads() only.
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module (decoding only)."""
        @staticmethod
        def loads(s):
            # Decode a UTF-8 byte string into the corresponding Python value.
            # All parse* helpers below take an index into s and return a
            # (next_index, value) tuple.
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                # Every error reports the offset and the unparsed remainder.
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past JSON whitespace; optionally demand more input.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (including \uXXXX and
                # UTF-16 surrogate pairs) into its character.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine the high and low halves.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    # Find the closing quote; an odd number of preceding
                    # backslashes means this quote is escaped, keep looking.
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                # Parse a JSON object starting at the opening brace.
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                # Parse a JSON array starting at the opening bracket.
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # Parse the literals true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # A fraction or exponent marker makes it a float.
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first significant character; numbers are the
            # fall-through default.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
194
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The previous implementation routed this value through a one-shot
    # generator (yield_preferredencoding().next()) for no benefit; compute
    # it directly instead.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it;
        # otherwise fall back to UTF-8.
        u'TEST'.encode(pref)
    except:
        pref = 'UTF-8'
    return pref
210
211
212def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
214
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
217 """
218 entity = matchobj.group(1)
219
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
223
224 # Unicode character
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
226 if mobj is not None:
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
229 base = 16
230 numstr = u'0%s' % numstr
231 else:
232 base = 10
233 return unichr(long(numstr, base))
234
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities first, then replace the path separator so the
    # title cannot escape into sub-directories.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
243
244
245def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
247
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
251 function.
252
253 It returns the tuple (stream, definitive_file_name).
254 """
255 try:
256 if filename == u'-':
257 if sys.platform == 'win32':
258 import msvcrt
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
270
271
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz returns None for anything it cannot parse; mirror that
    # by returning None instead of raising.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
def _simplify_title(title):
    """Reduce a title to a filename-safe slug: every run of characters
    other than letters, digits, underscore or hyphen becomes a single
    underscore, and leading/trailing underscores are stripped."""
    return re.sub(ur'[^\w\d_\-]+', u'_', title).strip(u'_')
235b3ba4
PH
282
class DownloadError(Exception):
    """Raised when a download attempt fails.

    FileDownloader.trouble() raises this, carrying the relevant error
    message, whenever the downloader has not been configured to continue
    on errors ('ignoreerrors').
    """
    pass
291
292
class SameFileError(Exception):
    """Raised when several downloads would collide on one output file.

    FileDownloader.download() raises this when multiple URLs are given
    but the output template contains no varying fields.
    """
    pass
300
301
class PostProcessingError(Exception):
    """Raised by a PostProcessor's .run() method to signal that the
    postprocessing task failed."""
    pass
309
310
class UnavailableVideoError(Exception):
    """Raised when a video is requested in a format that is not available
    for that video."""
    pass
318
319
class ContentTooShortError(Exception):
    """Raised when the server delivered fewer bytes than announced.

    FileDownloader raises this when a downloaded file is smaller than the
    size the server reported, which usually indicates an interrupted
    connection.
    """
    # Both byte counts; class-level defaults kept for API compatibility.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Deliberately no Exception.__init__ call: adding one would alter
        # the .args the original class exposes.
        self.downloaded = downloaded
        self.expected = expected
334
335
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress a deflate body, whether raw or zlib-wrapped."""
        try:
            # Negative wbits: raw deflate stream without a zlib header.
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a standard zlib-wrapped stream.
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl that carries the HTTP status code; older
        Pythons lack the four-argument constructor and getcode()."""
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Force std_headers onto the outgoing request (overriding any
        caller-set values) and strip the internal no-compression marker."""
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Transparently decode gzip- and deflate-encoded response bodies,
        preserving the original status line and headers."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
393
394
395class FileDownloader(object):
396 """File Downloader class.
397
398 File downloader objects are the ones responsible of downloading the
399 actual video file and writing it to disk if the user has requested
400 it, among some other tasks. In most cases there should be one per
401 program. As, given a video URL, the downloader doesn't know how to
402 extract all the needed information, task that InfoExtractors do, it
403 has to pass the URL to one of them.
404
405 For this, file downloader objects have a method that allows
406 InfoExtractors to be registered in a given order. When it is passed
407 a URL, the file downloader handles it to the first InfoExtractor it
408 finds that reports being able to handle it. The InfoExtractor extracts
409 all the information about the video or videos the URL refers to, and
410 asks the FileDownloader to process the video information, possibly
411 downloading the video.
412
413 File downloaders accept a lot of parameters. In order not to saturate
414 the object constructor with arguments, it receives a dictionary of
415 options instead. These options are available through the params
416 attribute for the InfoExtractors to use. The FileDownloader also
417 registers itself as the downloader in charge for the InfoExtractors
418 that are added to it, so this is a "mutual registration".
419
420 Available options:
421
422 username: Username for authentication purposes.
423 password: Password for authentication purposes.
424 usenetrc: Use netrc for authentication instead.
425 quiet: Do not print messages to stdout.
426 forceurl: Force printing final URL.
427 forcetitle: Force printing title.
428 forcethumbnail: Force printing thumbnail URL.
429 forcedescription: Force printing description.
430 forcefilename: Force printing final filename.
431 simulate: Do not download the video files.
432 format: Video format code.
433 format_limit: Highest quality format to try.
434 outtmpl: Template for output names.
435 ignoreerrors: Do not stop on download errors.
436 ratelimit: Download speed limit, in bytes/sec.
437 nooverwrites: Prevent overwriting files.
438 retries: Number of times to retry for HTTP error 5xx
439 continuedl: Try to continue downloads if possible.
440 noprogress: Do not print the progress bar.
441 playliststart: Playlist item to start at.
442 playlistend: Playlist item to end at.
443 matchtitle: Download only matching titles.
444 rejecttitle: Reject downloads for matching titles.
445 logtostderr: Log messages to stderr instead of stdout.
446 consoletitle: Display progress in console window's titlebar.
447 nopart: Do not use temporary .part files.
448 updatetime: Use the Last-modified header to set output file timestamps.
449 writedescription: Write the video description to a .description file
450 writeinfojson: Write the video description to a .info.json file
451 """
452
453 params = None
454 _ies = []
455 _pps = []
456 _download_retcode = None
457 _num_downloads = None
458 _screen_file = None
459
460 def __init__(self, params):
461 """Create a FileDownloader object with the given options."""
462 self._ies = []
463 self._pps = []
464 self._download_retcode = 0
465 self._num_downloads = 0
466 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
467 self.params = params
468
469 @staticmethod
470 def format_bytes(bytes):
471 if bytes is None:
472 return 'N/A'
473 if type(bytes) is str:
474 bytes = float(bytes)
475 if bytes == 0.0:
476 exponent = 0
477 else:
478 exponent = long(math.log(bytes, 1024.0))
479 suffix = 'bkMGTPEZY'[exponent]
480 converted = float(bytes) / float(1024 ** exponent)
481 return '%.2f%s' % (converted, suffix)
482
483 @staticmethod
484 def calc_percent(byte_counter, data_len):
485 if data_len is None:
486 return '---.-%'
487 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488
489 @staticmethod
490 def calc_eta(start, now, total, current):
491 if total is None:
492 return '--:--'
493 dif = now - start
494 if current == 0 or dif < 0.001: # One millisecond
495 return '--:--'
496 rate = float(current) / dif
497 eta = long((float(total) - float(current)) / rate)
498 (eta_mins, eta_secs) = divmod(eta, 60)
499 if eta_mins > 99:
500 return '--:--'
501 return '%02d:%02d' % (eta_mins, eta_secs)
502
503 @staticmethod
504 def calc_speed(start, now, bytes):
505 dif = now - start
506 if bytes == 0 or dif < 0.001: # One millisecond
507 return '%10s' % '---b/s'
508 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509
510 @staticmethod
511 def best_block_size(elapsed_time, bytes):
512 new_min = max(bytes / 2.0, 1.0)
513 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
514 if elapsed_time < 0.001:
515 return long(new_max)
516 rate = bytes / elapsed_time
517 if rate > new_max:
518 return long(new_max)
519 if rate < new_min:
520 return long(new_min)
521 return long(rate)
522
523 @staticmethod
524 def parse_bytes(bytestr):
525 """Parse a string indicating a byte quantity into a long integer."""
526 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 if matchobj is None:
528 return None
529 number = float(matchobj.group(1))
530 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
531 return long(round(number * multiplier))
532
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE also learns about this downloader.
        self._ies.append(ie)
        ie.set_downloader(self)
537
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)
542
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                # The trailing comma suppresses print's own newline; the
                # chosen terminator is appended explicitly instead.
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise
553
    def to_stderr(self, message):
        """Print message to stderr, encoded for the active locale."""
        print >>sys.stderr, message.encode(preferredencoding())
557
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-compatible escape sequence: ESC ] 0 ; title BEL
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
568
    def fixed_template(self):
        """Checks if the output template is fixed.

        A template is "fixed" when it contains no %(...)s interpolation
        fields, i.e. every download would be written to the same file.
        """
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
572
573 def trouble(self, message=None):
574 """Determine action to take when a download problem appears.
575
576 Depending on if the downloader has been configured to ignore
577 download errors or not, this method may throw an exception or
578 not when errors are found, after printing the message.
579 """
580 if message is not None:
581 self.to_stderr(message)
582 if not self.params.get('ignoreerrors', False):
583 raise DownloadError(message)
584 self._download_retcode = 1
585
586 def slow_down(self, start_time, byte_counter):
587 """Sleep if the download speed is over the rate limit."""
588 rate_limit = self.params.get('ratelimit', None)
589 if rate_limit is None or byte_counter == 0:
590 return
591 now = time.time()
592 elapsed = now - start_time
593 if elapsed <= 0.0:
594 return
595 speed = float(byte_counter) / elapsed
596 if speed > rate_limit:
597 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
598
599 def temp_name(self, filename):
600 """Returns a temporary filename for the given filename."""
601 if self.params.get('nopart', False) or filename == u'-' or \
602 (os.path.exists(filename) and not os.path.isfile(filename)):
603 return filename
604 return filename + u'.part'
605
606 def undo_temp_name(self, filename):
607 if filename.endswith(u'.part'):
608 return filename[:-len(u'.part')]
609 return filename
610
    def try_rename(self, old_filename, new_filename):
        """Rename old_filename to new_filename, reporting (not raising)
        any OS-level failure through trouble()."""
        try:
            if old_filename == new_filename:
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
618
619 def try_utime(self, filename, last_modified_hdr):
620 """Try to set the last-modified time of the given file."""
621 if last_modified_hdr is None:
622 return
623 if not os.path.isfile(filename):
624 return
625 timestr = last_modified_hdr
626 if timestr is None:
627 return
628 filetime = timeconvert(timestr)
629 if filetime is None:
630 return filetime
631 try:
632 os.utime(filename, (time.time(), filetime))
633 except:
634 pass
635 return filetime
636
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
640
    def report_writeinfojson(self, infofn):
        """ Report that the metadata file is being written """
        self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
644
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
648
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # The leading \r rewrites the current line in place instead of
        # scrolling the terminal.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
657
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
661
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
665
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message without the (unencodable) filename.
            self.to_screen(u'[download] The file has already been downloaded')
672
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
676
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            # No progress bar was shown, so print an explicit completion line.
            self.to_screen(u'[download] Download completed')
        else:
            # Terminate the in-place progress line with a newline.
            self.to_screen(u'')
683
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        # Feeds the 'autonumber' template field in prepare_filename().
        self._num_downloads += 1
687
    def prepare_filename(self, info_dict):
        """Generate the output filename from the 'outtmpl' template and
        info_dict, or return None (after reporting) on template errors."""
        try:
            template_dict = dict(info_dict)
            # Extra fields available to the template besides the extracted
            # metadata: current Unix time and a zero-padded per-run ordinal.
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
699
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        Handles forced printing, simulate mode, title filters, overwrite
        protection, sidecar files (.description / .info.json), the actual
        download and postprocessing — in that order.
        """
        filename = self.prepare_filename(info_dict)

        # Forced printings: emit requested fields on stdout (these run even
        # in simulate mode, serving the --get-title/--get-url style options).
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        # Title filters (--match-title / --reject-title), matched
        # case-insensitively against the encoded title.
        matchtitle=self.params.get('matchtitle',False)
        rejecttitle=self.params.get('rejecttitle',False)
        title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
            return
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
            return

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        # Create the containing directory if needed.
        try:
            dn = os.path.dirname(filename)
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            try:
                # Probe for a usable JSON *encoder*: the trivialjson
                # fallback class only implements loads().
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    # 'urlhandle' holds a live connection object and is not
                    # serializable, so it is dropped from the dump.
                    json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                    json.dump(json_info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        if not self.params.get('skip_download', False):
            try:
                success = self._do_download(filename, info_dict)
            except (OSError, IOError), err:
                # Local filesystem errors are treated as "format unavailable".
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                return
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                    self.trouble(u'ERROR: postprocessing: %s' % str(err))
                    return
797
798 def download(self, url_list):
799 """Download a given list of URLs."""
800 if len(url_list) > 1 and self.fixed_template():
801 raise SameFileError(self.params['outtmpl'])
802
803 for url in url_list:
804 suitable_found = False
805 for ie in self._ies:
806 # Go to next InfoExtractor if not suitable
807 if not ie.suitable(url):
808 continue
809
810 # Suitable InfoExtractor found
811 suitable_found = True
812
813 # Extract information from URL and process it
814 ie.extract(url)
815
816 # Suitable InfoExtractor had been found; go to next URL
817 break
818
819 if not suitable_found:
820 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
821
822 return self._download_retcode
823
824 def post_process(self, filename, ie_info):
825 """Run the postprocessing chain on the given file."""
826 info = dict(ie_info)
827 info['filepath'] = filename
828 for pp in self._pps:
829 info = pp.run(info)
830 if info is None:
831 break
832
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL via the external rtmpdump tool,
        re-invoking it with resume flags until no more progress is made.
        Returns True on success, False otherwise."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); after exit code 1 also skip key checks (-k 1).
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
869
870 def _do_download(self, filename, info_dict):
871 url = info_dict['url']
872 player_url = info_dict.get('player_url', None)
873
874 # Check file already present
875 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
876 self.report_file_already_downloaded(filename)
877 return True
878
879 # Attempt to download using rtmpdump
880 if url.startswith('rtmp'):
881 return self._download_with_rtmpdump(filename, url, player_url)
882
883 tmpfilename = self.temp_name(filename)
884 stream = None
885
886 # Do not include the Accept-Encoding header
887 headers = {'Youtubedl-no-compression': 'True'}
888 basic_request = urllib2.Request(url, None, headers)
889 request = urllib2.Request(url, None, headers)
890
891 # Establish possible resume length
892 if os.path.isfile(tmpfilename):
893 resume_len = os.path.getsize(tmpfilename)
894 else:
895 resume_len = 0
896
897 open_mode = 'wb'
898 if resume_len != 0:
899 if self.params.get('continuedl', False):
900 self.report_resuming_byte(resume_len)
901 request.add_header('Range','bytes=%d-' % resume_len)
902 open_mode = 'ab'
903 else:
904 resume_len = 0
905
906 count = 0
907 retries = self.params.get('retries', 0)
908 while count <= retries:
909 # Establish connection
910 try:
911 if count == 0 and 'urlhandle' in info_dict:
912 data = info_dict['urlhandle']
913 data = urllib2.urlopen(request)
914 break
915 except (urllib2.HTTPError, ), err:
916 if (err.code < 500 or err.code >= 600) and err.code != 416:
917 # Unexpected HTTP error
918 raise
919 elif err.code == 416:
920 # Unable to resume (requested range not satisfiable)
921 try:
922 # Open the connection again without the range header
923 data = urllib2.urlopen(basic_request)
924 content_length = data.info()['Content-Length']
925 except (urllib2.HTTPError, ), err:
926 if err.code < 500 or err.code >= 600:
927 raise
928 else:
929 # Examine the reported length
930 if (content_length is not None and
931 (resume_len - 100 < long(content_length) < resume_len + 100)):
932 # The file had already been fully downloaded.
933 # Explanation to the above condition: in issue #175 it was revealed that
934 # YouTube sometimes adds or removes a few bytes from the end of the file,
935 # changing the file size slightly and causing problems for some users. So
936 # I decided to implement a suggested change and consider the file
937 # completely downloaded if the file size differs less than 100 bytes from
938 # the one in the hard drive.
939 self.report_file_already_downloaded(filename)
940 self.try_rename(tmpfilename, filename)
941 return True
942 else:
943 # The length does not match, we start the download over
944 self.report_unable_to_resume()
945 open_mode = 'wb'
946 break
947 # Retry
948 count += 1
949 if count <= retries:
950 self.report_retry(count, retries)
951
952 if count > retries:
953 self.trouble(u'ERROR: giving up after %s retries' % retries)
954 return False
955
956 data_len = data.info().get('Content-length', None)
957 if data_len is not None:
958 data_len = long(data_len) + resume_len
959 data_len_str = self.format_bytes(data_len)
960 byte_counter = 0 + resume_len
961 block_size = 1024
962 start = time.time()
963 while True:
964 # Download and write
965 before = time.time()
966 data_block = data.read(block_size)
967 after = time.time()
968 if len(data_block) == 0:
969 break
970 byte_counter += len(data_block)
971
972 # Open file just in time
973 if stream is None:
974 try:
975 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
976 assert stream is not None
977 filename = self.undo_temp_name(tmpfilename)
978 self.report_destination(filename)
979 except (OSError, IOError), err:
980 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
981 return False
982 try:
983 stream.write(data_block)
984 except (IOError, OSError), err:
985 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
986 return False
987 block_size = self.best_block_size(after - before, len(data_block))
988
989 # Progress message
990 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
991 if data_len is None:
992 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
993 else:
994 percent_str = self.calc_percent(byte_counter, data_len)
995 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
996 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
997
998 # Apply rate limit
999 self.slow_down(start, byte_counter - resume_len)
1000
1001 if stream is None:
1002 self.trouble(u'\nERROR: Did not get any data blocks')
1003 return False
1004 stream.close()
1005 self.report_finish()
1006 if data_len is not None and byte_counter != data_len:
1007 raise ContentTooShortError(byte_counter, long(data_len))
1008 self.try_rename(tmpfilename, filename)
1009
1010 # Update file modification time
1011 if self.params.get('updatetime', True):
1012 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1013
1014 return True
1015
1016
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor (IE) turns a URL into one or more info
	dictionaries which the FileDownloader then acts upon (usually by
	downloading the video). Each dictionary must provide: id, url,
	uploader, title, stitle (simplified title), ext, format and
	player_url (may be None). Optional keys, used only by the forced
	printing functions: thumbnail and description.

	Subclasses must define a _VALID_URL regexp and override
	_real_initialize() and _real_extract(); they should also be
	registered in the list of extractors.
	"""

	# Whether one-time initialization (login, cookies, ...) has run.
	_ready = False
	# The FileDownloader this IE reports to and feeds results into.
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this IE can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if necessary, then extract info dicts for url."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader used by this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization; subclasses override. Default: no-op."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; subclasses override. Default: no-op."""
		pass
1085
1086
1087class YoutubeIE(InfoExtractor):
1088 """Information extractor for youtube.com."""
1089
1090 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1091 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1092 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1093 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1094 _NETRC_MACHINE = 'youtube'
1095 # Listed in order of quality
1096 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1097 _video_extensions = {
1098 '13': '3gp',
1099 '17': 'mp4',
1100 '18': 'mp4',
1101 '22': 'mp4',
1102 '37': 'mp4',
1103 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1104 '43': 'webm',
1105 '44': 'webm',
1106 '45': 'webm',
1107 }
1108 _video_dimensions = {
1109 '5': '240x400',
1110 '6': '???',
1111 '13': '???',
1112 '17': '144x176',
1113 '18': '360x640',
1114 '22': '720x1280',
1115 '34': '360x640',
1116 '35': '480x854',
1117 '37': '1080x1920',
1118 '38': '3072x4096',
1119 '43': '360x640',
1120 '44': '480x854',
1121 '45': '720x1280',
1122 }
1123 IE_NAME = u'youtube'
1124
1125 def report_lang(self):
1126 """Report attempt to set language."""
1127 self._downloader.to_screen(u'[youtube] Setting language')
1128
1129 def report_login(self):
1130 """Report attempt to log in."""
1131 self._downloader.to_screen(u'[youtube] Logging in')
1132
1133 def report_age_confirmation(self):
1134 """Report attempt to confirm age."""
1135 self._downloader.to_screen(u'[youtube] Confirming age')
1136
1137 def report_video_webpage_download(self, video_id):
1138 """Report attempt to download video webpage."""
1139 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1140
1141 def report_video_info_webpage_download(self, video_id):
1142 """Report attempt to download video info webpage."""
1143 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1144
1145 def report_information_extraction(self, video_id):
1146 """Report attempt to extract video information."""
1147 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1148
1149 def report_unavailable_format(self, video_id, format):
1150 """Report extracted video URL."""
1151 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1152
1153 def report_rtmp_download(self):
1154 """Indicate the download will use the RTMP protocol."""
1155 self._downloader.to_screen(u'[youtube] RTMP download detected')
1156
1157 def _print_formats(self, formats):
1158 print 'Available formats:'
1159 for x in formats:
1160 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1161
1162 def _real_initialize(self):
1163 if self._downloader is None:
1164 return
1165
1166 username = None
1167 password = None
1168 downloader_params = self._downloader.params
1169
1170 # Attempt to use provided username and password or .netrc data
1171 if downloader_params.get('username', None) is not None:
1172 username = downloader_params['username']
1173 password = downloader_params['password']
1174 elif downloader_params.get('usenetrc', False):
1175 try:
1176 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1177 if info is not None:
1178 username = info[0]
1179 password = info[2]
1180 else:
1181 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1182 except (IOError, netrc.NetrcParseError), err:
1183 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1184 return
1185
1186 # Set language
1187 request = urllib2.Request(self._LANG_URL)
1188 try:
1189 self.report_lang()
1190 urllib2.urlopen(request).read()
1191 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1192 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1193 return
1194
1195 # No authentication to be performed
1196 if username is None:
1197 return
1198
1199 # Log in
1200 login_form = {
1201 'current_form': 'loginForm',
1202 'next': '/',
1203 'action_login': 'Log In',
1204 'username': username,
1205 'password': password,
1206 }
1207 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1208 try:
1209 self.report_login()
1210 login_results = urllib2.urlopen(request).read()
1211 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1212 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1213 return
1214 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1215 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1216 return
1217
1218 # Confirm age
1219 age_form = {
1220 'next_url': '/',
1221 'action_confirm': 'Confirm',
1222 }
1223 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1224 try:
1225 self.report_age_confirmation()
1226 age_results = urllib2.urlopen(request).read()
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1229 return
1230
1231 def _real_extract(self, url):
1232 # Extract video id from URL
1233 mobj = re.match(self._VALID_URL, url)
1234 if mobj is None:
1235 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1236 return
1237 video_id = mobj.group(2)
1238
1239 # Get video webpage
1240 self.report_video_webpage_download(video_id)
1241 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1242 try:
1243 video_webpage = urllib2.urlopen(request).read()
1244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1245 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1246 return
1247
1248 # Attempt to extract SWF player URL
1249 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1250 if mobj is not None:
1251 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1252 else:
1253 player_url = None
1254
1255 # Get video info
1256 self.report_video_info_webpage_download(video_id)
1257 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1258 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1259 % (video_id, el_type))
1260 request = urllib2.Request(video_info_url)
1261 try:
1262 video_info_webpage = urllib2.urlopen(request).read()
1263 video_info = parse_qs(video_info_webpage)
1264 if 'token' in video_info:
1265 break
1266 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1267 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1268 return
1269 if 'token' not in video_info:
1270 if 'reason' in video_info:
1271 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1272 else:
1273 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1274 return
1275
1276 # Start extracting information
1277 self.report_information_extraction(video_id)
1278
1279 # uploader
1280 if 'author' not in video_info:
1281 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1282 return
1283 video_uploader = urllib.unquote_plus(video_info['author'][0])
1284
1285 # title
1286 if 'title' not in video_info:
1287 self._downloader.trouble(u'ERROR: unable to extract video title')
1288 return
1289 video_title = urllib.unquote_plus(video_info['title'][0])
1290 video_title = video_title.decode('utf-8')
1291 video_title = sanitize_title(video_title)
1292
1293 # simplified title
e092418d 1294 simple_title = _simplify_title(video_title)
235b3ba4
PH
1295
1296 # thumbnail image
1297 if 'thumbnail_url' not in video_info:
1298 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1299 video_thumbnail = ''
1300 else: # don't panic if we can't find it
1301 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1302
1303 # upload date
1304 upload_date = u'NA'
1305 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1306 if mobj is not None:
1307 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1308 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1309 for expression in format_expressions:
1310 try:
1311 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1312 except:
1313 pass
1314
1315 # description
1316 try:
1317 lxml.etree
1318 except NameError:
1319 video_description = u'No description available.'
1320 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1321 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1322 if mobj is not None:
1323 video_description = mobj.group(1).decode('utf-8')
1324 else:
1325 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1326 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1327 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1328 # TODO use another parser
1329
1330 # token
1331 video_token = urllib.unquote_plus(video_info['token'][0])
1332
1333 # Decide which formats to download
1334 req_format = self._downloader.params.get('format', None)
1335
1336 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1337 self.report_rtmp_download()
1338 video_url_list = [(None, video_info['conn'][0])]
1339 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1340 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1341 url_data = [parse_qs(uds) for uds in url_data_strs]
1342 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1343 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1344
1345 format_limit = self._downloader.params.get('format_limit', None)
1346 if format_limit is not None and format_limit in self._available_formats:
1347 format_list = self._available_formats[self._available_formats.index(format_limit):]
1348 else:
1349 format_list = self._available_formats
1350 existing_formats = [x for x in format_list if x in url_map]
1351 if len(existing_formats) == 0:
1352 self._downloader.trouble(u'ERROR: no known formats available for video')
1353 return
1354 if self._downloader.params.get('listformats', None):
1355 self._print_formats(existing_formats)
1356 return
1357 if req_format is None or req_format == 'best':
1358 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1359 elif req_format == 'worst':
1360 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1361 elif req_format in ('-1', 'all'):
1362 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1363 else:
1364 # Specific formats. We pick the first in a slash-delimeted sequence.
1365 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1366 req_formats = req_format.split('/')
1367 video_url_list = None
1368 for rf in req_formats:
1369 if rf in url_map:
1370 video_url_list = [(rf, url_map[rf])]
1371 break
1372 if video_url_list is None:
1373 self._downloader.trouble(u'ERROR: requested format not available')
1374 return
1375 else:
1376 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1377 return
1378
1379 for format_param, video_real_url in video_url_list:
1380 # At this point we have a new video
1381 self._downloader.increment_downloads()
1382
1383 # Extension
1384 video_extension = self._video_extensions.get(format_param, 'flv')
1385
1386 try:
1387 # Process video information
1388 self._downloader.process_info({
1389 'id': video_id.decode('utf-8'),
1390 'url': video_real_url.decode('utf-8'),
1391 'uploader': video_uploader.decode('utf-8'),
1392 'upload_date': upload_date,
1393 'title': video_title,
1394 'stitle': simple_title,
1395 'ext': video_extension.decode('utf-8'),
1396 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1397 'thumbnail': video_thumbnail.decode('utf-8'),
1398 'description': video_description,
1399 'player_url': player_url,
1400 })
1401 except UnavailableVideoError, err:
1402 self._downloader.trouble(u'\nERROR: unable to download video')
1403
1404
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1: video id, group 2: URL slug used as the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# Delegate extractor for metacafe pages that merely embed a YouTube video.
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Requires a YoutubeIE to delegate 'yt-' videos to."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and disable the family filter so that
		age-restricted videos are reachable."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
		}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information from a metacafe watch URL and pass it
		to the downloader. Delegates 'yt-' ids to the YouTube extractor."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube ('yt-<youtube id>' ids are embeds)
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# Old-style pages expose the media URL directly as a query parameter.
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available (access key appended to the URL)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer pages bury the media URL in JSON inside the player's
			# flashvars; the key plays the same role as gdaKey above.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Undo the JSON escaping of slashes.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1545
1546
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1: video id, group 2: URL slug used as the simplified title.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a Dailymotion video URL and pass
		it to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter cookie so restricted videos are reachable.
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# The player's "sequence" flashvar holds a JSON blob with the stream URLs.
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		# Strip the JSON backslash escapes.
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1633
1634
1635class GoogleIE(InfoExtractor):
1636 """Information extractor for video.google.com."""
1637
1638 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1639 IE_NAME = u'video.google'
1640
1641 def __init__(self, downloader=None):
1642 InfoExtractor.__init__(self, downloader)
1643
1644 def report_download_webpage(self, video_id):
1645 """Report webpage download."""
1646 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1647
1648 def report_extraction(self, video_id):
1649 """Report information extraction."""
1650 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1651
1652 def _real_extract(self, url):
1653 # Extract id from URL
1654 mobj = re.match(self._VALID_URL, url)
1655 if mobj is None:
1656 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1657 return
1658
1659 # At this point we have a new video
1660 self._downloader.increment_downloads()
1661 video_id = mobj.group(1)
1662
1663 video_extension = 'mp4'
1664
1665 # Retrieve video webpage to extract further information
1666 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1667 try:
1668 self.report_download_webpage(video_id)
1669 webpage = urllib2.urlopen(request).read()
1670 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1671 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1672 return
1673
1674 # Extract URL, uploader, and title from webpage
1675 self.report_extraction(video_id)
1676 mobj = re.search(r"download_url:'([^']+)'", webpage)
1677 if mobj is None:
1678 video_extension = 'flv'
1679 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1680 if mobj is None:
1681 self._downloader.trouble(u'ERROR: unable to extract media URL')
1682 return
1683 mediaURL = urllib.unquote(mobj.group(1))
1684 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1685 mediaURL = mediaURL.replace('\\x26', '\x26')
1686
1687 video_url = mediaURL
1688
1689 mobj = re.search(r'<title>(.*)</title>', webpage)
1690 if mobj is None:
1691 self._downloader.trouble(u'ERROR: unable to extract title')
1692 return
1693 video_title = mobj.group(1).decode('utf-8')
1694 video_title = sanitize_title(video_title)
e092418d 1695 simple_title = _simplify_title(video_title)
235b3ba4
PH
1696
1697 # Extract video description
1698 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1699 if mobj is None:
1700 self._downloader.trouble(u'ERROR: unable to extract video description')
1701 return
1702 video_description = mobj.group(1).decode('utf-8')
1703 if not video_description:
1704 video_description = 'No description available.'
1705
1706 # Extract video thumbnail
1707 if self._downloader.params.get('forcethumbnail', False):
1708 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1709 try:
1710 webpage = urllib2.urlopen(request).read()
1711 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1712 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1713 return
1714 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1715 if mobj is None:
1716 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1717 return
1718 video_thumbnail = mobj.group(1)
1719 else: # we need something to pass to process_info
1720 video_thumbnail = ''
1721
1722 try:
1723 # Process video information
1724 self._downloader.process_info({
1725 'id': video_id.decode('utf-8'),
1726 'url': video_url.decode('utf-8'),
1727 'uploader': u'NA',
1728 'upload_date': u'NA',
1729 'title': video_title,
1730 'stitle': simple_title,
1731 'ext': video_extension.decode('utf-8'),
1732 'format': u'NA',
1733 'player_url': None,
1734 })
1735 except UnavailableVideoError:
1736 self._downloader.trouble(u'\nERROR: unable to download video')
1737
1738
1739class PhotobucketIE(InfoExtractor):
1740 """Information extractor for photobucket.com."""
1741
1742 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1743 IE_NAME = u'photobucket'
1744
1745 def __init__(self, downloader=None):
1746 InfoExtractor.__init__(self, downloader)
1747
1748 def report_download_webpage(self, video_id):
1749 """Report webpage download."""
1750 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1751
1752 def report_extraction(self, video_id):
1753 """Report information extraction."""
1754 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1755
1756 def _real_extract(self, url):
1757 # Extract id from URL
1758 mobj = re.match(self._VALID_URL, url)
1759 if mobj is None:
1760 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1761 return
1762
1763 # At this point we have a new video
1764 self._downloader.increment_downloads()
1765 video_id = mobj.group(1)
1766
1767 video_extension = 'flv'
1768
1769 # Retrieve video webpage to extract further information
1770 request = urllib2.Request(url)
1771 try:
1772 self.report_download_webpage(video_id)
1773 webpage = urllib2.urlopen(request).read()
1774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1776 return
1777
1778 # Extract URL, uploader, and title from webpage
1779 self.report_extraction(video_id)
1780 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1781 if mobj is None:
1782 self._downloader.trouble(u'ERROR: unable to extract media URL')
1783 return
1784 mediaURL = urllib.unquote(mobj.group(1))
1785
1786 video_url = mediaURL
1787
1788 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1789 if mobj is None:
1790 self._downloader.trouble(u'ERROR: unable to extract title')
1791 return
1792 video_title = mobj.group(1).decode('utf-8')
1793 video_title = sanitize_title(video_title)
e092418d 1794 simple_title = _simplify_title(vide_title)
235b3ba4
PH
1795
1796 video_uploader = mobj.group(2).decode('utf-8')
1797
1798 try:
1799 # Process video information
1800 self._downloader.process_info({
1801 'id': video_id.decode('utf-8'),
1802 'url': video_url.decode('utf-8'),
1803 'uploader': video_uploader,
1804 'upload_date': u'NA',
1805 'title': video_title,
1806 'stitle': simple_title,
1807 'ext': video_extension.decode('utf-8'),
1808 'format': u'NA',
1809 'player_url': None,
1810 })
1811 except UnavailableVideoError:
1812 self._downloader.trouble(u'\nERROR: unable to download video')
1813
1814
1815class YahooIE(InfoExtractor):
1816 """Information extractor for video.yahoo.com."""
1817
1818 # _VALID_URL matches all Yahoo! Video URLs
1819 # _VPAGE_URL matches only the extractable '/watch/' URLs
1820 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1821 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1822 IE_NAME = u'video.yahoo'
1823
1824 def __init__(self, downloader=None):
1825 InfoExtractor.__init__(self, downloader)
1826
1827 def report_download_webpage(self, video_id):
1828 """Report webpage download."""
1829 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1830
1831 def report_extraction(self, video_id):
1832 """Report information extraction."""
1833 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1834
1835 def _real_extract(self, url, new_video=True):
1836 # Extract ID from URL
1837 mobj = re.match(self._VALID_URL, url)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1840 return
1841
1842 # At this point we have a new video
1843 self._downloader.increment_downloads()
1844 video_id = mobj.group(2)
1845 video_extension = 'flv'
1846
1847 # Rewrite valid but non-extractable URLs as
1848 # extractable English language /watch/ URLs
1849 if re.match(self._VPAGE_URL, url) is None:
1850 request = urllib2.Request(url)
1851 try:
1852 webpage = urllib2.urlopen(request).read()
1853 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1854 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1855 return
1856
1857 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1858 if mobj is None:
1859 self._downloader.trouble(u'ERROR: Unable to extract id field')
1860 return
1861 yahoo_id = mobj.group(1)
1862
1863 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1864 if mobj is None:
1865 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1866 return
1867 yahoo_vid = mobj.group(1)
1868
1869 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1870 return self._real_extract(url, new_video=False)
1871
1872 # Retrieve video webpage to extract further information
1873 request = urllib2.Request(url)
1874 try:
1875 self.report_download_webpage(video_id)
1876 webpage = urllib2.urlopen(request).read()
1877 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1878 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1879 return
1880
1881 # Extract uploader and title from webpage
1882 self.report_extraction(video_id)
1883 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1884 if mobj is None:
1885 self._downloader.trouble(u'ERROR: unable to extract video title')
1886 return
1887 video_title = mobj.group(1).decode('utf-8')
e092418d 1888 simple_title = _simplify_title(video_title)
235b3ba4
PH
1889
1890 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1891 if mobj is None:
1892 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1893 return
1894 video_uploader = mobj.group(1).decode('utf-8')
1895
1896 # Extract video thumbnail
1897 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1898 if mobj is None:
1899 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1900 return
1901 video_thumbnail = mobj.group(1).decode('utf-8')
1902
1903 # Extract video description
1904 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1905 if mobj is None:
1906 self._downloader.trouble(u'ERROR: unable to extract video description')
1907 return
1908 video_description = mobj.group(1).decode('utf-8')
1909 if not video_description:
1910 video_description = 'No description available.'
1911
1912 # Extract video height and width
1913 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1914 if mobj is None:
1915 self._downloader.trouble(u'ERROR: unable to extract video height')
1916 return
1917 yv_video_height = mobj.group(1)
1918
1919 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1920 if mobj is None:
1921 self._downloader.trouble(u'ERROR: unable to extract video width')
1922 return
1923 yv_video_width = mobj.group(1)
1924
1925 # Retrieve video playlist to extract media URL
1926 # I'm not completely sure what all these options are, but we
1927 # seem to need most of them, otherwise the server sends a 401.
1928 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1929 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1930 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1931 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1932 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1933 try:
1934 self.report_download_webpage(video_id)
1935 webpage = urllib2.urlopen(request).read()
1936 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1937 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1938 return
1939
1940 # Extract media URL from playlist XML
1941 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1942 if mobj is None:
1943 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1944 return
1945 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1946 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1947
1948 try:
1949 # Process video information
1950 self._downloader.process_info({
1951 'id': video_id.decode('utf-8'),
1952 'url': video_url,
1953 'uploader': video_uploader,
1954 'upload_date': u'NA',
1955 'title': video_title,
1956 'stitle': simple_title,
1957 'ext': video_extension.decode('utf-8'),
1958 'thumbnail': video_thumbnail.decode('utf-8'),
1959 'description': video_description,
1960 'thumbnail': video_thumbnail,
1961 'player_url': None,
1962 })
1963 except UnavailableVideoError:
1964 self._downloader.trouble(u'\nERROR: unable to download video')
1965
1966
1967class VimeoIE(InfoExtractor):
1968 """Information extractor for vimeo.com."""
1969
1970 # _VALID_URL matches Vimeo URLs
1971 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1972 IE_NAME = u'vimeo'
1973
1974 def __init__(self, downloader=None):
1975 InfoExtractor.__init__(self, downloader)
1976
1977 def report_download_webpage(self, video_id):
1978 """Report webpage download."""
1979 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1980
1981 def report_extraction(self, video_id):
1982 """Report information extraction."""
1983 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1984
1985 def _real_extract(self, url, new_video=True):
1986 # Extract ID from URL
1987 mobj = re.match(self._VALID_URL, url)
1988 if mobj is None:
1989 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1990 return
1991
1992 # At this point we have a new video
1993 self._downloader.increment_downloads()
1994 video_id = mobj.group(1)
1995
1996 # Retrieve video webpage to extract further information
1997 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1998 try:
1999 self.report_download_webpage(video_id)
2000 webpage = urllib2.urlopen(request).read()
2001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2003 return
2004
2005 # Now we begin extracting as much information as we can from what we
2006 # retrieved. First we extract the information common to all extractors,
2007 # and latter we extract those that are Vimeo specific.
2008 self.report_extraction(video_id)
2009
2010 # Extract title
2011 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2012 if mobj is None:
2013 self._downloader.trouble(u'ERROR: unable to extract video title')
2014 return
2015 video_title = mobj.group(1).decode('utf-8')
e092418d 2016 simple_title = _simple_title(video_title)
235b3ba4
PH
2017
2018 # Extract uploader
2019 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2020 if mobj is None:
2021 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2022 return
2023 video_uploader = mobj.group(1).decode('utf-8')
2024
2025 # Extract video thumbnail
2026 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2027 if mobj is None:
2028 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2029 return
2030 video_thumbnail = mobj.group(1).decode('utf-8')
2031
2032 # # Extract video description
2033 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2034 # if mobj is None:
2035 # self._downloader.trouble(u'ERROR: unable to extract video description')
2036 # return
2037 # video_description = mobj.group(1).decode('utf-8')
2038 # if not video_description: video_description = 'No description available.'
2039 video_description = 'Foo.'
2040
2041 # Vimeo specific: extract request signature
2042 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2043 if mobj is None:
2044 self._downloader.trouble(u'ERROR: unable to extract request signature')
2045 return
2046 sig = mobj.group(1).decode('utf-8')
2047
2048 # Vimeo specific: extract video quality information
2049 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2050 if mobj is None:
2051 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2052 return
2053 quality = mobj.group(1).decode('utf-8')
2054
2055 if int(quality) == 1:
2056 quality = 'hd'
2057 else:
2058 quality = 'sd'
2059
2060 # Vimeo specific: Extract request signature expiration
2061 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2062 if mobj is None:
2063 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2064 return
2065 sig_exp = mobj.group(1).decode('utf-8')
2066
2067 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2068
2069 try:
2070 # Process video information
2071 self._downloader.process_info({
2072 'id': video_id.decode('utf-8'),
2073 'url': video_url,
2074 'uploader': video_uploader,
2075 'upload_date': u'NA',
2076 'title': video_title,
2077 'stitle': simple_title,
2078 'ext': u'mp4',
2079 'thumbnail': video_thumbnail.decode('utf-8'),
2080 'description': video_description,
2081 'thumbnail': video_thumbnail,
2082 'description': video_description,
2083 'player_url': None,
2084 })
2085 except UnavailableVideoError:
2086 self._downloader.trouble(u'ERROR: unable to download video')
2087
2088
2089class GenericIE(InfoExtractor):
2090 """Generic last-resort information extractor."""
2091
2092 _VALID_URL = r'.*'
2093 IE_NAME = u'generic'
2094
2095 def __init__(self, downloader=None):
2096 InfoExtractor.__init__(self, downloader)
2097
2098 def report_download_webpage(self, video_id):
2099 """Report webpage download."""
2100 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2101 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2102
2103 def report_extraction(self, video_id):
2104 """Report information extraction."""
2105 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2106
2107 def _real_extract(self, url):
2108 # At this point we have a new video
2109 self._downloader.increment_downloads()
2110
2111 video_id = url.split('/')[-1]
2112 request = urllib2.Request(url)
2113 try:
2114 self.report_download_webpage(video_id)
2115 webpage = urllib2.urlopen(request).read()
2116 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2117 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2118 return
2119 except ValueError, err:
2120 # since this is the last-resort InfoExtractor, if
2121 # this error is thrown, it'll be thrown here
2122 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2123 return
2124
2125 self.report_extraction(video_id)
2126 # Start with something easy: JW Player in SWFObject
2127 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2128 if mobj is None:
2129 # Broaden the search a little bit
2130 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2131 if mobj is None:
2132 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2133 return
2134
2135 # It's possible that one of the regexes
2136 # matched, but returned an empty group:
2137 if mobj.group(1) is None:
2138 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2139 return
2140
2141 video_url = urllib.unquote(mobj.group(1))
2142 video_id = os.path.basename(video_url)
2143
2144 # here's a fun little line of code for you:
2145 video_extension = os.path.splitext(video_id)[1][1:]
2146 video_id = os.path.splitext(video_id)[0]
2147
2148 # it's tempting to parse this further, but you would
2149 # have to take into account all the variations like
2150 # Video Title - Site Name
2151 # Site Name | Video Title
2152 # Video Title - Tagline | Site Name
2153 # and so on and so forth; it's just not practical
2154 mobj = re.search(r'<title>(.*)</title>', webpage)
2155 if mobj is None:
2156 self._downloader.trouble(u'ERROR: unable to extract title')
2157 return
2158 video_title = mobj.group(1).decode('utf-8')
2159 video_title = sanitize_title(video_title)
e092418d 2160 simple_title = _simplify_title(video_title)
235b3ba4
PH
2161
2162 # video uploader is domain name
2163 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2164 if mobj is None:
2165 self._downloader.trouble(u'ERROR: unable to extract title')
2166 return
2167 video_uploader = mobj.group(1).decode('utf-8')
2168
2169 try:
2170 # Process video information
2171 self._downloader.process_info({
2172 'id': video_id.decode('utf-8'),
2173 'url': video_url.decode('utf-8'),
2174 'uploader': video_uploader,
2175 'upload_date': u'NA',
2176 'title': video_title,
2177 'stitle': simple_title,
2178 'ext': video_extension.decode('utf-8'),
2179 'format': u'NA',
2180 'player_url': None,
2181 })
2182 except UnavailableVideoError, err:
2183 self._downloader.trouble(u'\nERROR: unable to download video')
2184
2185
2186class YoutubeSearchIE(InfoExtractor):
2187 """Information Extractor for YouTube search queries."""
2188 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2189 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2190 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2191 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2192 _youtube_ie = None
2193 _max_youtube_results = 1000
2194 IE_NAME = u'youtube:search'
2195
2196 def __init__(self, youtube_ie, downloader=None):
2197 InfoExtractor.__init__(self, downloader)
2198 self._youtube_ie = youtube_ie
2199
2200 def report_download_page(self, query, pagenum):
2201 """Report attempt to download playlist page with given number."""
2202 query = query.decode(preferredencoding())
2203 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2204
2205 def _real_initialize(self):
2206 self._youtube_ie.initialize()
2207
2208 def _real_extract(self, query):
2209 mobj = re.match(self._VALID_URL, query)
2210 if mobj is None:
2211 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2212 return
2213
2214 prefix, query = query.split(':')
2215 prefix = prefix[8:]
2216 query = query.encode('utf-8')
2217 if prefix == '':
2218 self._download_n_results(query, 1)
2219 return
2220 elif prefix == 'all':
2221 self._download_n_results(query, self._max_youtube_results)
2222 return
2223 else:
2224 try:
2225 n = long(prefix)
2226 if n <= 0:
2227 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2228 return
2229 elif n > self._max_youtube_results:
2230 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2231 n = self._max_youtube_results
2232 self._download_n_results(query, n)
2233 return
2234 except ValueError: # parsing prefix as integer fails
2235 self._download_n_results(query, 1)
2236 return
2237
2238 def _download_n_results(self, query, n):
2239 """Downloads a specified number of results for a query"""
2240
2241 video_ids = []
2242 already_seen = set()
2243 pagenum = 1
2244
2245 while True:
2246 self.report_download_page(query, pagenum)
2247 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2248 request = urllib2.Request(result_url)
2249 try:
2250 page = urllib2.urlopen(request).read()
2251 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2252 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2253 return
2254
2255 # Extract video identifiers
2256 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2257 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2258 if video_id not in already_seen:
2259 video_ids.append(video_id)
2260 already_seen.add(video_id)
2261 if len(video_ids) == n:
2262 # Specified n videos reached
2263 for id in video_ids:
2264 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2265 return
2266
2267 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2268 for id in video_ids:
2269 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2270 return
2271
2272 pagenum = pagenum + 1
2273
2274
2275class GoogleSearchIE(InfoExtractor):
2276 """Information Extractor for Google Video search queries."""
2277 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2278 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2279 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2280 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2281 _google_ie = None
2282 _max_google_results = 1000
2283 IE_NAME = u'video.google:search'
2284
2285 def __init__(self, google_ie, downloader=None):
2286 InfoExtractor.__init__(self, downloader)
2287 self._google_ie = google_ie
2288
2289 def report_download_page(self, query, pagenum):
2290 """Report attempt to download playlist page with given number."""
2291 query = query.decode(preferredencoding())
2292 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2293
2294 def _real_initialize(self):
2295 self._google_ie.initialize()
2296
2297 def _real_extract(self, query):
2298 mobj = re.match(self._VALID_URL, query)
2299 if mobj is None:
2300 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2301 return
2302
2303 prefix, query = query.split(':')
2304 prefix = prefix[8:]
2305 query = query.encode('utf-8')
2306 if prefix == '':
2307 self._download_n_results(query, 1)
2308 return
2309 elif prefix == 'all':
2310 self._download_n_results(query, self._max_google_results)
2311 return
2312 else:
2313 try:
2314 n = long(prefix)
2315 if n <= 0:
2316 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2317 return
2318 elif n > self._max_google_results:
2319 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2320 n = self._max_google_results
2321 self._download_n_results(query, n)
2322 return
2323 except ValueError: # parsing prefix as integer fails
2324 self._download_n_results(query, 1)
2325 return
2326
2327 def _download_n_results(self, query, n):
2328 """Downloads a specified number of results for a query"""
2329
2330 video_ids = []
2331 already_seen = set()
2332 pagenum = 1
2333
2334 while True:
2335 self.report_download_page(query, pagenum)
2336 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2337 request = urllib2.Request(result_url)
2338 try:
2339 page = urllib2.urlopen(request).read()
2340 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2341 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2342 return
2343
2344 # Extract video identifiers
2345 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2346 video_id = mobj.group(1)
2347 if video_id not in already_seen:
2348 video_ids.append(video_id)
2349 already_seen.add(video_id)
2350 if len(video_ids) == n:
2351 # Specified n videos reached
2352 for id in video_ids:
2353 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2354 return
2355
2356 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2357 for id in video_ids:
2358 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2359 return
2360
2361 pagenum = pagenum + 1
2362
2363
2364class YahooSearchIE(InfoExtractor):
2365 """Information Extractor for Yahoo! Video search queries."""
2366 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2367 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2368 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2369 _MORE_PAGES_INDICATOR = r'\s*Next'
2370 _yahoo_ie = None
2371 _max_yahoo_results = 1000
2372 IE_NAME = u'video.yahoo:search'
2373
2374 def __init__(self, yahoo_ie, downloader=None):
2375 InfoExtractor.__init__(self, downloader)
2376 self._yahoo_ie = yahoo_ie
2377
2378 def report_download_page(self, query, pagenum):
2379 """Report attempt to download playlist page with given number."""
2380 query = query.decode(preferredencoding())
2381 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2382
2383 def _real_initialize(self):
2384 self._yahoo_ie.initialize()
2385
2386 def _real_extract(self, query):
2387 mobj = re.match(self._VALID_URL, query)
2388 if mobj is None:
2389 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2390 return
2391
2392 prefix, query = query.split(':')
2393 prefix = prefix[8:]
2394 query = query.encode('utf-8')
2395 if prefix == '':
2396 self._download_n_results(query, 1)
2397 return
2398 elif prefix == 'all':
2399 self._download_n_results(query, self._max_yahoo_results)
2400 return
2401 else:
2402 try:
2403 n = long(prefix)
2404 if n <= 0:
2405 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2406 return
2407 elif n > self._max_yahoo_results:
2408 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2409 n = self._max_yahoo_results
2410 self._download_n_results(query, n)
2411 return
2412 except ValueError: # parsing prefix as integer fails
2413 self._download_n_results(query, 1)
2414 return
2415
2416 def _download_n_results(self, query, n):
2417 """Downloads a specified number of results for a query"""
2418
2419 video_ids = []
2420 already_seen = set()
2421 pagenum = 1
2422
2423 while True:
2424 self.report_download_page(query, pagenum)
2425 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2426 request = urllib2.Request(result_url)
2427 try:
2428 page = urllib2.urlopen(request).read()
2429 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2430 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2431 return
2432
2433 # Extract video identifiers
2434 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2435 video_id = mobj.group(1)
2436 if video_id not in already_seen:
2437 video_ids.append(video_id)
2438 already_seen.add(video_id)
2439 if len(video_ids) == n:
2440 # Specified n videos reached
2441 for id in video_ids:
2442 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2443 return
2444
2445 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2446 for id in video_ids:
2447 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2448 return
2449
2450 pagenum = pagenum + 1
2451
2452
2453class YoutubePlaylistIE(InfoExtractor):
2454 """Information Extractor for YouTube playlists."""
2455
2456 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2457 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2458 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2459 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2460 _youtube_ie = None
2461 IE_NAME = u'youtube:playlist'
2462
2463 def __init__(self, youtube_ie, downloader=None):
2464 InfoExtractor.__init__(self, downloader)
2465 self._youtube_ie = youtube_ie
2466
2467 def report_download_page(self, playlist_id, pagenum):
2468 """Report attempt to download playlist page with given number."""
2469 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2470
2471 def _real_initialize(self):
2472 self._youtube_ie.initialize()
2473
2474 def _real_extract(self, url):
2475 # Extract playlist id
2476 mobj = re.match(self._VALID_URL, url)
2477 if mobj is None:
2478 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2479 return
2480
2481 # Single video case
2482 if mobj.group(3) is not None:
2483 self._youtube_ie.extract(mobj.group(3))
2484 return
2485
2486 # Download playlist pages
2487 # prefix is 'p' as default for playlists but there are other types that need extra care
2488 playlist_prefix = mobj.group(1)
2489 if playlist_prefix == 'a':
2490 playlist_access = 'artist'
2491 else:
2492 playlist_prefix = 'p'
2493 playlist_access = 'view_play_list'
2494 playlist_id = mobj.group(2)
2495 video_ids = []
2496 pagenum = 1
2497
2498 while True:
2499 self.report_download_page(playlist_id, pagenum)
2500 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2501 request = urllib2.Request(url)
2502 try:
2503 page = urllib2.urlopen(request).read()
2504 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2505 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2506 return
2507
2508 # Extract video identifiers
2509 ids_in_page = []
2510 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2511 if mobj.group(1) not in ids_in_page:
2512 ids_in_page.append(mobj.group(1))
2513 video_ids.extend(ids_in_page)
2514
2515 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2516 break
2517 pagenum = pagenum + 1
2518
2519 playliststart = self._downloader.params.get('playliststart', 1) - 1
2520 playlistend = self._downloader.params.get('playlistend', -1)
2521 video_ids = video_ids[playliststart:playlistend]
2522
2523 for id in video_ids:
2524 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2525 return
2526
2527
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Enumerates every upload of a user through the GData API and feeds
	each resulting watch URL to the wrapped YoutubeIE.
	"""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps each uploads query at 50 entries, so pagination is required.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		"""Make sure the wrapped YouTube extractor is initialized (login etc.)."""
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Gather all video ids for the user, then delegate each download."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated within the page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# Apply the user's playlist window; -1 means "until the end".
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2615
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com

	Simulates pressing the 'Free download' button and scrapes the real
	fileshare URL from the resulting page.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
	IE_NAME = u'DepositFiles'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_extract(self, url):
		"""Extract the direct file URL and title from a depositfiles page."""
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 is what the button submits)
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		try:
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
			return

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			# The site explains restrictions (quota, parallel downloads)
			# inside an <strong>Attention...</strong> element.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = mobj.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = mobj.group(1).decode('utf-8')

		try:
			# Process file information
			self._downloader.process_info({
				'id': file_id.decode('utf-8'),
				'url': file_url.decode('utf-8'),
				'uploader': u'NA',
				'upload_date': u'NA',
				'title': file_title,
				'stitle': file_title,
				'ext': file_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
2689
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Matches video/photo pages; the numeric video id is captured as 'ID'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint used by _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is in effect.
    _NETRC_MACHINE = 'facebook'
    # Format names in descending quality order; format_limit slices this list.
    _available_formats = ['video', 'highqual', 'lowqual']
    # File extension per format name (all mp4 at present).
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each key maps to a regex whose first group is the value.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped and URL-quoted inside the page markup.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied (CLI or .netrc)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials available: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and hand every requested format to the downloader."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        # NOTE(review): _parse_page never sets an 'upload_date' key, so this
        # branch appears to be dead code at present -- confirm before relying
        # on upload_date being anything other than u'NA'.
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        # NOTE(review): an extracted description is already unicode (from
        # _parse_page), so the .decode('utf-8') below may fail on non-ASCII
        # text -- verify.
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list to the quality cap and below.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv metadata (JSON API or direct media URL) and process it."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            # If the response is already a media file, skip the JSON step and
            # build the info dict from the URL itself.
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh # reuse the already-open handle for the download
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            # NOTE(review): 'json' is not among the imports visible at the top
            # of this file -- confirm it is provided elsewhere in the module.
            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' envelope or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '08-15-11 09:30PM' -> '20110815'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
2998
2999class MyVideoIE(InfoExtractor):
3000 """Information Extractor for myvideo.de."""
3001
3002 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3003 IE_NAME = u'myvideo'
3004
3005 def __init__(self, downloader=None):
3006 InfoExtractor.__init__(self, downloader)
3007
3008 def report_download_webpage(self, video_id):
3009 """Report webpage download."""
3010 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3011
3012 def report_extraction(self, video_id):
3013 """Report information extraction."""
3014 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3015
3016 def _real_extract(self,url):
3017 mobj = re.match(self._VALID_URL, url)
3018 if mobj is None:
3019 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3020 return
3021
3022 video_id = mobj.group(1)
235b3ba4
PH
3023
3024 # Get video webpage
3025 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3026 try:
3027 self.report_download_webpage(video_id)
3028 webpage = urllib2.urlopen(request).read()
3029 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3030 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3031 return
3032
3033 self.report_extraction(video_id)
3034 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3035 webpage)
3036 if mobj is None:
3037 self._downloader.trouble(u'ERROR: unable to extract media URL')
3038 return
3039 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3040
3041 mobj = re.search('<title>([^<]+)</title>', webpage)
3042 if mobj is None:
3043 self._downloader.trouble(u'ERROR: unable to extract title')
3044 return
3045
3046 video_title = mobj.group(1)
3047 video_title = sanitize_title(video_title)
3048
e092418d
PH
3049 simple_title = _simplify_title(video_title)
3050
235b3ba4
PH
3051 try:
3052 self._downloader.process_info({
3053 'id': video_id,
3054 'url': video_url,
3055 'uploader': u'NA',
3056 'upload_date': u'NA',
3057 'title': video_title,
3058 'stitle': simple_title,
3059 'ext': u'flv',
3060 'format': u'NA',
3061 'player_url': None,
3062 })
3063 except UnavailableVideoError:
3064 self._downloader.trouble(u'\nERROR: Unable to download video')
3065
3066class ComedyCentralIE(InfoExtractor):
3067 """Information extractor for The Daily Show and Colbert Report """
3068
3069 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3070 IE_NAME = u'comedycentral'
3071
3072 def report_extraction(self, episode_id):
3073 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3074
3075 def report_config_download(self, episode_id):
3076 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3077
3078 def report_index_download(self, episode_id):
3079 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3080
3081 def report_player_url(self, episode_id):
3082 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3083
235b3ba4
PH
3084 def _real_extract(self, url):
3085 mobj = re.match(self._VALID_URL, url)
3086 if mobj is None:
3087 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3088 return
3089
3090 if mobj.group('shortname'):
3091 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3092 url = 'http://www.thedailyshow.com/full-episodes/'
3093 else:
3094 url = 'http://www.colbertnation.com/full-episodes/'
3095 mobj = re.match(self._VALID_URL, url)
3096 assert mobj is not None
3097
3098 dlNewest = not mobj.group('episode')
3099 if dlNewest:
3100 epTitle = mobj.group('showname')
3101 else:
3102 epTitle = mobj.group('episode')
3103
3104 req = urllib2.Request(url)
3105 self.report_extraction(epTitle)
3106 try:
3107 htmlHandle = urllib2.urlopen(req)
3108 html = htmlHandle.read()
3109 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3110 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3111 return
3112 if dlNewest:
3113 url = htmlHandle.geturl()
3114 mobj = re.match(self._VALID_URL, url)
3115 if mobj is None:
3116 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3117 return
3118 if mobj.group('episode') == '':
3119 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3120 return
3121 epTitle = mobj.group('episode')
3122
3123 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3124 if len(mMovieParams) == 0:
3125 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3126 return
3127
3128 playerUrl_raw = mMovieParams[0][0]
3129 self.report_player_url(epTitle)
3130 try:
3131 urlHandle = urllib2.urlopen(playerUrl_raw)
3132 playerUrl = urlHandle.geturl()
3133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3135 return
3136
3137 uri = mMovieParams[0][1]
3138 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3139 self.report_index_download(epTitle)
3140 try:
3141 indexXml = urllib2.urlopen(indexUrl).read()
3142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3143 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3144 return
3145
3146 idoc = xml.etree.ElementTree.fromstring(indexXml)
3147 itemEls = idoc.findall('.//item')
3148 for itemEl in itemEls:
3149 mediaId = itemEl.findall('./guid')[0].text
3150 shortMediaId = mediaId.split(':')[-1]
3151 showId = mediaId.split(':')[-2].replace('.com', '')
3152 officialTitle = itemEl.findall('./title')[0].text
3153 officialDate = itemEl.findall('./pubDate')[0].text
3154
3155 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3156 urllib.urlencode({'uri': mediaId}))
3157 configReq = urllib2.Request(configUrl)
3158 self.report_config_download(epTitle)
3159 try:
3160 configXml = urllib2.urlopen(configReq).read()
3161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3162 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3163 return
3164
3165 cdoc = xml.etree.ElementTree.fromstring(configXml)
3166 turls = []
3167 for rendition in cdoc.findall('.//rendition'):
3168 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3169 turls.append(finfo)
3170
3171 if len(turls) == 0:
3172 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3173 continue
3174
3175 # For now, just pick the highest bitrate
3176 format,video_url = turls[-1]
3177
3178 self._downloader.increment_downloads()
3179
3180 effTitle = showId + '-' + epTitle
3181 info = {
3182 'id': shortMediaId,
3183 'url': video_url,
3184 'uploader': showId,
3185 'upload_date': officialDate,
3186 'title': effTitle,
3187 'stitle': self._simplify_title(effTitle),
3188 'ext': 'mp4',
3189 'format': format,
3190 'thumbnail': None,
3191 'description': officialTitle,
3192 'player_url': playerUrl
3193 }
3194
3195 try:
3196 self._downloader.process_info(info)
3197 except UnavailableVideoError, err:
3198 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3199 continue
3200
3201
3202class EscapistIE(InfoExtractor):
3203 """Information extractor for The Escapist """
3204
3205 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3206 IE_NAME = u'escapist'
3207
3208 def report_extraction(self, showName):
3209 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3210
3211 def report_config_download(self, showName):
3212 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3213
235b3ba4
PH
3214 def _real_extract(self, url):
3215 htmlParser = HTMLParser.HTMLParser()
3216
3217 mobj = re.match(self._VALID_URL, url)
3218 if mobj is None:
3219 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3220 return
3221 showName = mobj.group('showname')
3222 videoId = mobj.group('episode')
3223
3224 self.report_extraction(showName)
3225 try:
3226 webPage = urllib2.urlopen(url).read()
3227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3228 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3229 return
3230
3231 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3232 description = htmlParser.unescape(descMatch.group(1))
3233 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3234 imgUrl = htmlParser.unescape(imgMatch.group(1))
3235 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3236 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3237 configUrlMatch = re.search('config=(.*)$', playerUrl)
3238 configUrl = urllib2.unquote(configUrlMatch.group(1))
3239
3240 self.report_config_download(showName)
3241 try:
3242 configJSON = urllib2.urlopen(configUrl).read()
3243 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3244 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3245 return
3246
3247 # Technically, it's JavaScript, not JSON
3248 configJSON = configJSON.replace("'", '"')
3249
3250 try:
3251 config = json.loads(configJSON)
3252 except (ValueError,), err:
3253 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3254 return
3255
3256 playlist = config['playlist']
3257 videoUrl = playlist[1]['url']
3258
3259 self._downloader.increment_downloads()
3260 info = {
3261 'id': videoId,
3262 'url': videoUrl,
3263 'uploader': showName,
3264 'upload_date': None,
3265 'title': showName,
e092418d 3266 'stitle': _simplify_title(showName),
235b3ba4
PH
3267 'ext': 'flv',
3268 'format': 'flv',
3269 'thumbnail': imgUrl,
3270 'description': description,
3271 'player_url': playerUrl,
3272 }
3273
3274 try:
3275 self._downloader.process_info(info)
3276 except UnavailableVideoError, err:
3277 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3278
3279
3280class CollegeHumorIE(InfoExtractor):
3281 """Information extractor for collegehumor.com"""
3282
3283 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3284 IE_NAME = u'collegehumor'
3285
3286 def report_webpage(self, video_id):
3287 """Report information extraction."""
3288 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3289
3290 def report_extraction(self, video_id):
3291 """Report information extraction."""
3292 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3293
235b3ba4
PH
3294 def _real_extract(self, url):
3295 htmlParser = HTMLParser.HTMLParser()
3296
3297 mobj = re.match(self._VALID_URL, url)
3298 if mobj is None:
3299 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3300 return
3301 video_id = mobj.group('videoid')
3302
3303 self.report_webpage(video_id)
3304 request = urllib2.Request(url)
3305 try:
3306 webpage = urllib2.urlopen(request).read()
3307 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3308 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3309 return
3310
3311 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3312 if m is None:
3313 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3314 return
3315 internal_video_id = m.group('internalvideoid')
3316
3317 info = {
3318 'id': video_id,
3319 'internal_id': internal_video_id,
3320 }
3321
3322 self.report_extraction(video_id)
3323 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3324 try:
3325 metaXml = urllib2.urlopen(xmlUrl).read()
3326 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3327 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3328 return
3329
3330 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3331 try:
3332 videoNode = mdoc.findall('./video')[0]
3333 info['description'] = videoNode.findall('./description')[0].text
3334 info['title'] = videoNode.findall('./caption')[0].text
e092418d 3335 info['stitle'] = _simplify_title(info['title'])
235b3ba4
PH
3336 info['url'] = videoNode.findall('./file')[0].text
3337 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3338 info['ext'] = info['url'].rpartition('.')[2]
3339 info['format'] = info['ext']
3340 except IndexError:
3341 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3342 return
3343
3344 self._downloader.increment_downloads()
3345
3346 try:
3347 self._downloader.process_info(info)
3348 except UnavailableVideoError, err:
3349 self._downloader.trouble(u'\nERROR: unable to download video')
3350
3351
3352class XVideosIE(InfoExtractor):
3353 """Information extractor for xvideos.com"""
3354
3355 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3356 IE_NAME = u'xvideos'
3357
3358 def report_webpage(self, video_id):
3359 """Report information extraction."""
3360 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3361
3362 def report_extraction(self, video_id):
3363 """Report information extraction."""
3364 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3365
235b3ba4
PH
3366 def _real_extract(self, url):
3367 htmlParser = HTMLParser.HTMLParser()
3368
3369 mobj = re.match(self._VALID_URL, url)
3370 if mobj is None:
3371 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3372 return
3373 video_id = mobj.group(1).decode('utf-8')
3374
3375 self.report_webpage(video_id)
3376
3377 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3378 try:
3379 webpage = urllib2.urlopen(request).read()
3380 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3382 return
3383
3384 self.report_extraction(video_id)
3385
3386
3387 # Extract video URL
3388 mobj = re.search(r'flv_url=(.+?)&', webpage)
3389 if mobj is None:
3390 self._downloader.trouble(u'ERROR: unable to extract video url')
3391 return
3392 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3393
3394
3395 # Extract title
3396 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3397 if mobj is None:
3398 self._downloader.trouble(u'ERROR: unable to extract video title')
3399 return
3400 video_title = mobj.group(1).decode('utf-8')
3401
3402
3403 # Extract video thumbnail
3404 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3405 if mobj is None:
3406 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3407 return
3408 video_thumbnail = mobj.group(1).decode('utf-8')
3409
3410
3411
3412 self._downloader.increment_downloads()
3413 info = {
3414 'id': video_id,
3415 'url': video_url,
3416 'uploader': None,
3417 'upload_date': None,
3418 'title': video_title,
e092418d 3419 'stitle': _simplify_title(video_title),
235b3ba4
PH
3420 'ext': 'flv',
3421 'format': 'flv',
3422 'thumbnail': video_thumbnail,
3423 'description': None,
3424 'player_url': None,
3425 }
3426
3427 try:
3428 self._downloader.process_info(info)
3429 except UnavailableVideoError, err:
3430 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3431
3432
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    # Captures the uploader slug (group 1) and the song slug (group 2).
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape uid/stream token from the track page and build the media URL."""
        # Parser instance is created but never used below.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        # NOTE(review): if this search fails, video_id/stream_token stay
        # unbound and the mediaURL line below raises NameError.
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj:
            video_id = mobj.group(1)
            stream_token = mobj.group(2)

        # extract unsimplified title
        # NOTE(review): 'title' is extracted but never used; process_info
        # below receives simple_title instead.
        mobj = re.search('"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1)

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1)

        # upload date
        upload_date = None
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                # e.g. 'November 4, 2011 11:30' -> '20111104'
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            except Exception as e:
                print str(e)

        # for soundcloud, a request to a cross domain is required for cookies
        # NOTE(review): urllib2.Request's second positional argument is POST
        # data, not headers, and this request object is never opened --
        # confirm whether this line is needed at all.
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        try:
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': mediaURL,
                'uploader': uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': simple_title.decode('utf-8'),
                'stitle': simple_title.decode('utf-8'),
                'ext': u'mp3',
                'format': u'NA',
                'player_url': None,
                'description': description.decode('utf-8')
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
3529
3530
3531class InfoQIE(InfoExtractor):
3532 """Information extractor for infoq.com"""
3533
3534 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3535 IE_NAME = u'infoq'
3536
3537 def report_webpage(self, video_id):
3538 """Report information extraction."""
3539 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3540
3541 def report_extraction(self, video_id):
3542 """Report information extraction."""
3543 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3544
235b3ba4
PH
3545 def _real_extract(self, url):
3546 htmlParser = HTMLParser.HTMLParser()
3547
3548 mobj = re.match(self._VALID_URL, url)
3549 if mobj is None:
3550 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3551 return
3552
3553 self.report_webpage(url)
3554
3555 request = urllib2.Request(url)
3556 try:
3557 webpage = urllib2.urlopen(request).read()
3558 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3559 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3560 return
3561
3562 self.report_extraction(url)
3563
3564
3565 # Extract video URL
3566 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3567 if mobj is None:
3568 self._downloader.trouble(u'ERROR: unable to extract video url')
3569 return
3570 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3571
3572
3573 # Extract title
3574 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3575 if mobj is None:
3576 self._downloader.trouble(u'ERROR: unable to extract video title')
3577 return
3578 video_title = mobj.group(1).decode('utf-8')
3579
3580 # Extract description
3581 video_description = u'No description available.'
3582 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3583 if mobj is not None:
3584 video_description = mobj.group(1).decode('utf-8')
3585
3586 video_filename = video_url.split('/')[-1]
3587 video_id, extension = video_filename.split('.')
3588
3589 self._downloader.increment_downloads()
3590 info = {
3591 'id': video_id,
3592 'url': video_url,
3593 'uploader': None,
3594 'upload_date': None,
3595 'title': video_title,
e092418d 3596 'stitle': _simplify_title(video_title),
235b3ba4
PH
3597 'ext': extension,
3598 'format': extension, # Extension is always(?) mp4, but seems to be flv
3599 'thumbnail': None,
3600 'description': video_description,
3601 'player_url': None,
3602 }
3603
3604 try:
3605 self._downloader.process_info(info)
3606 except UnavailableVideoError, err:
3607 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3608
3609
3610
class PostProcessor(object):
	"""Base class for all post-processing steps.

	A PostProcessor is registered with a downloader through its
	add_post_processor() method. After every successful download the
	downloader invokes run() on each registered processor in turn,
	feeding the dictionary returned by the previous one into the next.
	Returning None stops the chain; reaching the last processor ends it
	normally.

	Post processors and downloaders follow the same "mutual
	registration" scheme as InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Record *downloader* as the downloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		*information* is an InfoExtractor-style dictionary carrying one
		extra key, 'filepath', which names the downloaded file.

		Return None to stop the post-processing chain, or an information
		dictionary (possibly with modified fields) to pass along to the
		next processor in the chain. Implementations may also raise
		PostProcessingError, which the calling downloader handles.

		The default implementation passes the data through untouched.
		"""
		return information
3656
3657
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that converts a downloaded video file into an
	audio-only file using the external ffmpeg and ffprobe tools."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec      # 'best', 'aac', 'mp3' or 'vorbis'
		self._preferredquality = preferredquality  # ffmpeg -ab bitrate spec, or None
		self._keepvideo = keepvideo                # keep the source video after extraction

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of *path* (via ffprobe), or None
		if ffprobe is unavailable or no audio stream is found."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			finally:
				# Close explicitly so the devnull fd is not leaked.
				devnull.close()
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints key=value lines per stream; remember the last
		# codec_name seen and report it once the stream is known to be audio.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode *path* to *out_path* with the given audio codec and
		extra ffmpeg options. Return True on success, False otherwise."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				# Close explicitly so the devnull fd is not leaked.
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract the audio track from information['filepath'].

		Returns the information dictionary with 'filepath' pointing at
		the new audio file, or None on failure (which stops the chain).
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Narrowed from a bare except so SystemExit and
				# KeyboardInterrupt are no longer swallowed here;
				# setting the mtime remains best-effort.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3757
3758
3759def updateSelf(downloader, filename):
3760 ''' Update the program file with the latest version from the repository '''
3761 # Note: downloader only used for options
3762 if not os.access(filename, os.W_OK):
3763 sys.exit('ERROR: no write permissions on %s' % filename)
3764
3765 downloader.to_screen('Updating to latest version...')
3766
3767 try:
3768 try:
3769 urlh = urllib.urlopen(UPDATE_URL)
3770 newcontent = urlh.read()
3771
3772 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3773 if vmatch is not None and vmatch.group(1) == __version__:
3774 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3775 return
3776 finally:
3777 urlh.close()
3778 except (IOError, OSError), err:
3779 sys.exit('ERROR: unable to download latest version')
3780
3781 try:
3782 outf = open(filename, 'wb')
3783 try:
3784 outf.write(newcontent)
3785 finally:
3786 outf.close()
3787 except (IOError, OSError), err:
3788 sys.exit('ERROR: unable to overwrite current version')
3789
3790 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3791
def parseOpts():
	"""Build the command-line option parser and parse sys.argv.

	Returns (parser, opts, args): the parser itself (so callers can use
	parser.error), the parsed option values, and the positional URLs.
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# An explicit COLUMNS environment variable wins over probing.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except Exception:
			# Narrowed from a bare except: probing the terminal size is
			# best-effort, but KeyboardInterrupt/SystemExit must not be
			# swallowed here.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general = optparse.OptionGroup(parser, 'General Options')
	selection = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	postproc = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3978
def gen_extractors():
	"""Instantiate every supported information extractor.

	Order is significant: a URL is handled by the first extractor in the
	returned list that matches it, so GenericIE stays last as the
	catch-all.
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	return [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),

		GenericIE()
	]
4012
def _real_main():
	"""Parse the command line, configure a FileDownloader and run it.

	Terminates the process via sys.exit() in every path: with the
	downloader's return code on success, or with an error message on
	invalid options / unreadable files.
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load when the file already exists and is readable; a
			# missing file is simply created when the jar is saved later.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and lines starting with '#', '/' or ';'
			# (treated as comments).
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	extractors = gen_extractors()

	if opts.list_extractors:
		# Print each extractor name followed by the URLs it would claim;
		# each URL is listed under the first extractor that matches it.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# NOTE(review): getpass is imported inside parseOpts(), not at
		# module level — verify this name is actually in scope here.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Pick the first applicable output template: an explicit -o wins,
		# then the -f -1 variants, then title/literal/autonumber combos,
		# and finally the plain id-based default.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()
	retcode = fd.download(all_urls)

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4172
def main():
	"""CLI entry point: run _real_main and map known failures to exit codes."""
	try:
		_real_main()
	except DownloadError:
		# The downloader already reported details; just exit nonzero.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4182
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
	main()
4185
4186# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: