2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__
= 'Public Domain'
21 __version__
= '2011.12.15'
23 UPDATE_URL
= 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO
as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse
import parse_qs
63 from cgi
import parse_qs
71 import xml
.etree
.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings
.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
91 def raiseError(msg
, i
):
92 raise ValueError(msg
+ ' at position ' + str(i
) + ' of ' + repr(s
) + ': ' + repr(s
[i
:]))
93 def skipSpace(i
, expectMore
=True):
94 while i
< len(s
) and s
[i
] in ' \t\r\n':
98 raiseError('Premature end', i
)
100 def decodeEscape(match
):
116 return unichr(int(esc
[1:5], 16))
117 if len(esc
) == 5+6 and esc
[5:7] == '\\u':
118 hi
= int(esc
[1:5], 16)
119 low
= int(esc
[7:11], 16)
120 return unichr((hi
- 0xd800) * 0x400 + low
- 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc
))
128 while s
[e
-bslashes
-1] == '\\':
130 if bslashes
% 2 == 1:
134 rexp
= re
.compile(r
'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri
= rexp
.sub(decodeEscape
, s
[i
:e
])
141 if s
[i
] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i
)
146 i
,key
= parseString(i
)
148 if i
>= len(s
) or s
[i
] != ':':
149 raiseError('Expected a colon', i
)
156 raiseError('Expected comma or closing curly brace', i
)
161 if s
[i
] == ']': # Empty array
166 i
= skipSpace(i
) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i
)
172 def parseDiscrete(i
):
173 for k
,v
in {'true': True, 'false': False, 'null': None}
.items():
174 if s
.startswith(k
, i
):
176 raiseError('Not a boolean (or null)', i
)
178 mobj
= re
.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s
[i
:])
180 raiseError('Not a number', i
)
182 if '.' in nums
or 'e' in nums
or 'E' in nums
:
183 return (i
+len(nums
), float(nums
))
184 return (i
+len(nums
), int(nums
))
185 CHARMAP
= {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i
,res
= CHARMAP
.get(s
[i
], parseNumber
)(i
)
189 i
= skipSpace(i
, False)
193 raise ValueError('Extra data at end of input (index ' + str(i
) + ' of ' + repr(s
) + ': ' + repr(s
[i
:]) + ')')
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
202 def yield_preferredencoding():
204 pref
= locale
.getpreferredencoding()
210 return yield_preferredencoding().next()
213 def htmlentity_transform(matchobj
):
214 """Transforms an HTML entity to a Unicode character.
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
219 entity
= matchobj
.group(1)
221 # Known non-numeric HTML entity
222 if entity
in htmlentitydefs
.name2codepoint
:
223 return unichr(htmlentitydefs
.name2codepoint
[entity
])
226 mobj
= re
.match(ur
'(?u)#(x?\d+)', entity
)
228 numstr
= mobj
.group(1)
229 if numstr
.startswith(u
'x'):
231 numstr
= u
'0%s' % numstr
234 return unichr(long(numstr
, base
))
236 # Unknown entity in name, return its literal representation
237 return (u
'&%s;' % entity
)
def sanitize_title(utitle):
    """Make a video title safe for use as part of a filename.

    HTML entities are decoded through htmlentity_transform, and any
    occurrence of the OS path separator is masked so the title cannot
    introduce extra directory components.
    """
    # Decode HTML entities such as &amp; into their Unicode characters.
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    # os.sep inside a title would split the filename into directories.
    return decoded.replace(unicode(os.sep), u'%')
246 def sanitize_open(filename
, open_mode
):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
258 if sys
.platform
== 'win32':
260 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
261 return (sys
.stdout
, filename
)
262 stream
= open(filename
, open_mode
)
263 return (stream
, filename
)
264 except (IOError, OSError), err
:
265 # In case of error, try to remove win32 forbidden chars
266 filename
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
)
268 # An exception here should be caught in the caller
269 stream
= open(filename
, open_mode
)
270 return (stream
, filename
)
273 def timeconvert(timestr
):
274 """Convert RFC 2822 defined time string into system timestamp"""
276 timetuple
= email
.utils
.parsedate_tz(timestr
)
277 if timetuple
is not None:
278 timestamp
= email
.utils
.mktime_tz(timetuple
)
281 def _simplify_title(title
):
282 expr
= re
.compile(ur
'[^\w\d_\-]+', flags
=re
.UNICODE
)
283 return expr
.sub(u
'_', title
).strip(u
'_')
285 def _orderedSet(iterable
):
286 """ Remove all duplicates from the input iterable """
293 class DownloadError(Exception):
294 """Download Error exception.
296 This exception may be thrown by FileDownloader objects if they are not
297 configured to continue on errors. They will contain the appropriate
303 class SameFileError(Exception):
304 """Same File exception.
306 This exception will be thrown by FileDownloader objects if they detect
307 multiple files would have to be downloaded to the same file on disk.
312 class PostProcessingError(Exception):
313 """Post Processing exception.
315 This exception may be raised by PostProcessor's .run() method to
316 indicate an error in the postprocessing task.
320 class MaxDownloadsReached(Exception):
321 """ --max-downloads limit has been reached. """
325 class UnavailableVideoError(Exception):
326 """Unavailable Format exception.
328 This exception will be thrown when a video is requested
329 in a format that is not available for that video.
334 class ContentTooShortError(Exception):
335 """Content Too Short exception.
337 This exception may be raised by FileDownloader objects when a file they
338 download is too small for what the server announced first, indicating
339 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Store the actual and announced byte counts for later reporting."""
    self.downloaded = downloaded  # bytes actually received
    self.expected = expected  # bytes the server announced for the download
350 class YoutubeDLHandler(urllib2
.HTTPHandler
):
351 """Handler for HTTP requests and responses.
353 This class, when installed with an OpenerDirector, automatically adds
354 the standard headers to every HTTP request and handles gzipped and
355 deflated responses from web servers. If compression is to be avoided in
356 a particular request, the original request in the program code only has
357 to include the HTTP header "Youtubedl-No-Compression", which will be
358 removed before making the real request.
360 Part of this code was copied from:
362 http://techknack.net/python-urllib2-handlers/
364 Andrew Rowls, the author of that code, agreed to release it to the
371 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
373 return zlib
.decompress(data
)
376 def addinfourl_wrapper(stream
, headers
, url
, code
):
377 if hasattr(urllib2
.addinfourl
, 'getcode'):
378 return urllib2
.addinfourl(stream
, headers
, url
, code
)
379 ret
= urllib2
.addinfourl(stream
, headers
, url
)
383 def http_request(self
, req
):
384 for h
in std_headers
:
387 req
.add_header(h
, std_headers
[h
])
388 if 'Youtubedl-no-compression' in req
.headers
:
389 if 'Accept-encoding' in req
.headers
:
390 del req
.headers
['Accept-encoding']
391 del req
.headers
['Youtubedl-no-compression']
394 def http_response(self
, req
, resp
):
397 if resp
.headers
.get('Content-encoding', '') == 'gzip':
398 gz
= gzip
.GzipFile(fileobj
=StringIO
.StringIO(resp
.read()), mode
='r')
399 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
400 resp
.msg
= old_resp
.msg
402 if resp
.headers
.get('Content-encoding', '') == 'deflate':
403 gz
= StringIO
.StringIO(self
.deflate(resp
.read()))
404 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
405 resp
.msg
= old_resp
.msg
409 class FileDownloader(object):
410 """File Downloader class.
412 File downloader objects are the ones responsible of downloading the
413 actual video file and writing it to disk if the user has requested
414 it, among some other tasks. In most cases there should be one per
415 program. As, given a video URL, the downloader doesn't know how to
416 extract all the needed information, task that InfoExtractors do, it
417 has to pass the URL to one of them.
419 For this, file downloader objects have a method that allows
420 InfoExtractors to be registered in a given order. When it is passed
421 a URL, the file downloader handles it to the first InfoExtractor it
422 finds that reports being able to handle it. The InfoExtractor extracts
423 all the information about the video or videos the URL refers to, and
424 asks the FileDownloader to process the video information, possibly
425 downloading the video.
427 File downloaders accept a lot of parameters. In order not to saturate
428 the object constructor with arguments, it receives a dictionary of
429 options instead. These options are available through the params
430 attribute for the InfoExtractors to use. The FileDownloader also
431 registers itself as the downloader in charge for the InfoExtractors
432 that are added to it, so this is a "mutual registration".
436 username: Username for authentication purposes.
437 password: Password for authentication purposes.
438 usenetrc: Use netrc for authentication instead.
439 quiet: Do not print messages to stdout.
440 forceurl: Force printing final URL.
441 forcetitle: Force printing title.
442 forcethumbnail: Force printing thumbnail URL.
443 forcedescription: Force printing description.
444 forcefilename: Force printing final filename.
445 simulate: Do not download the video files.
446 format: Video format code.
447 format_limit: Highest quality format to try.
448 outtmpl: Template for output names.
449 ignoreerrors: Do not stop on download errors.
450 ratelimit: Download speed limit, in bytes/sec.
451 nooverwrites: Prevent overwriting files.
452 retries: Number of times to retry for HTTP error 5xx
453 continuedl: Try to continue downloads if possible.
454 noprogress: Do not print the progress bar.
455 playliststart: Playlist item to start at.
456 playlistend: Playlist item to end at.
457 matchtitle: Download only matching titles.
458 rejecttitle: Reject downloads for matching titles.
459 logtostderr: Log messages to stderr instead of stdout.
460 consoletitle: Display progress in console window's titlebar.
461 nopart: Do not use temporary .part files.
462 updatetime: Use the Last-modified header to set output file timestamps.
463 writedescription: Write the video description to a .description file
464 writeinfojson: Write the video description to a .info.json file
470 _download_retcode
= None
471 _num_downloads
= None
474 def __init__(self
, params
):
475 """Create a FileDownloader object with the given options."""
478 self
._download
_retcode
= 0
479 self
._num
_downloads
= 0
480 self
._screen
_file
= [sys
.stdout
, sys
.stderr
][params
.get('logtostderr', False)]
484 def format_bytes(bytes):
487 if type(bytes) is str:
492 exponent
= long(math
.log(bytes, 1024.0))
493 suffix
= 'bkMGTPEZY'[exponent
]
494 converted
= float(bytes) / float(1024 ** exponent
)
495 return '%.2f%s' % (converted
, suffix
)
498 def calc_percent(byte_counter
, data_len
):
501 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0))
504 def calc_eta(start
, now
, total
, current
):
508 if current
== 0 or dif
< 0.001: # One millisecond
510 rate
= float(current
) / dif
511 eta
= long((float(total
) - float(current
)) / rate
)
512 (eta_mins
, eta_secs
) = divmod(eta
, 60)
515 return '%02d:%02d' % (eta_mins
, eta_secs
)
518 def calc_speed(start
, now
, bytes):
520 if bytes == 0 or dif
< 0.001: # One millisecond
521 return '%10s' % '---b/s'
522 return '%10s' % ('%s/s' % FileDownloader
.format_bytes(float(bytes) / dif
))
525 def best_block_size(elapsed_time
, bytes):
526 new_min
= max(bytes / 2.0, 1.0)
527 new_max
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528 if elapsed_time
< 0.001:
530 rate
= bytes / elapsed_time
538 def parse_bytes(bytestr
):
539 """Parse a string indicating a byte quantity into a long integer."""
540 matchobj
= re
.match(r
'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr
)
543 number
= float(matchobj
.group(1))
544 multiplier
= 1024.0 ** 'bkmgtpezy'.index(matchobj
.group(2).lower())
545 return long(round(number
* multiplier
))
547 def add_info_extractor(self
, ie
):
548 """Add an InfoExtractor object to the end of the list."""
550 ie
.set_downloader(self
)
552 def add_post_processor(self
, pp
):
553 """Add a PostProcessor object to the end of the chain."""
555 pp
.set_downloader(self
)
557 def to_screen(self
, message
, skip_eol
=False, ignore_encoding_errors
=False):
558 """Print message to stdout if not in quiet mode."""
560 if not self
.params
.get('quiet', False):
561 terminator
= [u
'\n', u
''][skip_eol
]
562 print >>self
._screen
_file
, (u
'%s%s' % (message
, terminator
)).encode(preferredencoding()),
563 self
._screen
_file
.flush()
564 except (UnicodeEncodeError), err
:
565 if not ignore_encoding_errors
:
def to_stderr(self, message):
    """Write *message* to stderr, encoded for the current locale."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
572 def to_cons_title(self
, message
):
573 """Set console/terminal window title to message."""
574 if not self
.params
.get('consoletitle', False):
576 if os
.name
== 'nt' and ctypes
.windll
.kernel32
.GetConsoleWindow():
577 # c_wchar_p() might not be necessary if `message` is
578 # already of type unicode()
579 ctypes
.windll
.kernel32
.SetConsoleTitleW(ctypes
.c_wchar_p(message
))
580 elif 'TERM' in os
.environ
:
581 sys
.stderr
.write('\033]0;%s\007' % message
.encode(preferredencoding()))
def fixed_template(self):
    """Return True when the output template contains no %(...)s fields.

    A template without named substitutions always expands to the same
    filename, so downloading several URLs with it would collide.
    """
    field_re = u'(?u)%\\(.+?\\)s'
    return re.search(field_re, self.params['outtmpl']) is None
def trouble(self, message=None):
    """Handle a download problem.

    The message, when given, is always printed to stderr. Afterwards,
    unless the downloader was configured with 'ignoreerrors', a
    DownloadError is raised; otherwise the stored return code is set
    to failure and execution continues.
    """
    if message is not None:
        self.to_stderr(message)
    ignoring = self.params.get('ignoreerrors', False)
    if not ignoring:
        raise DownloadError(message)
    self._download_retcode = 1
600 def slow_down(self
, start_time
, byte_counter
):
601 """Sleep if the download speed is over the rate limit."""
602 rate_limit
= self
.params
.get('ratelimit', None)
603 if rate_limit
is None or byte_counter
== 0:
606 elapsed
= now
- start_time
609 speed
= float(byte_counter
) / elapsed
610 if speed
> rate_limit
:
611 time
.sleep((byte_counter
- rate_limit
* (now
- start_time
)) / rate_limit
)
613 def temp_name(self
, filename
):
614 """Returns a temporary filename for the given filename."""
615 if self
.params
.get('nopart', False) or filename
== u
'-' or \
616 (os
.path
.exists(filename
) and not os
.path
.isfile(filename
)):
618 return filename
+ u
'.part'
620 def undo_temp_name(self
, filename
):
621 if filename
.endswith(u
'.part'):
622 return filename
[:-len(u
'.part')]
625 def try_rename(self
, old_filename
, new_filename
):
627 if old_filename
== new_filename
:
629 os
.rename(old_filename
, new_filename
)
630 except (IOError, OSError), err
:
631 self
.trouble(u
'ERROR: unable to rename file')
633 def try_utime(self
, filename
, last_modified_hdr
):
634 """Try to set the last-modified time of the given file."""
635 if last_modified_hdr
is None:
637 if not os
.path
.isfile(filename
):
639 timestr
= last_modified_hdr
642 filetime
= timeconvert(timestr
)
646 os
.utime(filename
, (time
.time(), filetime
))
def report_writedescription(self, descfn):
    """Announce that the video description is being written to *descfn*."""
    note = u'[info] Writing video description to: %s' % descfn
    self.to_screen(note, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Announce that the JSON metadata file *infofn* has been written."""
    note = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(note, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Announce the destination filename of the current download."""
    note = u'[download] Destination: %s' % filename
    self.to_screen(note, ignore_encoding_errors=True)
663 def report_progress(self
, percent_str
, data_len_str
, speed_str
, eta_str
):
664 """Report download progress."""
665 if self
.params
.get('noprogress', False):
667 self
.to_screen(u
'\r[download] %s of %s at %s ETA %s' %
668 (percent_str
, data_len_str
, speed_str
, eta_str
), skip_eol
=True)
669 self
.to_cons_title(u
'youtube-dl - %s of %s at %s ETA %s' %
670 (percent_str
.strip(), data_len_str
.strip(), speed_str
.strip(), eta_str
.strip()))
def report_resuming_byte(self, resume_len):
    """Announce that the download resumes at byte offset *resume_len*."""
    note = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(note)
def report_retry(self, count, retries):
    """Announce a retry (attempt *count* of *retries*) after a server-side
    HTTP 5xx error."""
    note = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(note)
680 def report_file_already_downloaded(self
, file_name
):
681 """Report file has already been fully downloaded."""
683 self
.to_screen(u
'[download] %s has already been downloaded' % file_name
)
684 except (UnicodeEncodeError), err
:
685 self
.to_screen(u
'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    # Called from _do_download when a partial download cannot be
    # continued and the transfer restarts from the beginning.
    self.to_screen(u'[download] Unable to resume')
691 def report_finish(self
):
692 """Report download finished."""
693 if self
.params
.get('noprogress', False):
694 self
.to_screen(u
'[download] Download completed')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    # Feeds the %(autonumber)s field expanded in prepare_filename().
    self._num_downloads += 1
702 def prepare_filename(self
, info_dict
):
703 """Generate the output filename."""
705 template_dict
= dict(info_dict
)
706 template_dict
['epoch'] = unicode(long(time
.time()))
707 template_dict
['autonumber'] = unicode('%05d' % self
._num
_downloads
)
708 filename
= self
.params
['outtmpl'] % template_dict
710 except (ValueError, KeyError), err
:
711 self
.trouble(u
'ERROR: invalid system charset or erroneous output template')
714 def _match_entry(self
, info_dict
):
715 """ Returns None iff the file should be downloaded """
717 title
= info_dict
['title']
718 matchtitle
= self
.params
.get('matchtitle', False)
719 if matchtitle
and not re
.search(matchtitle
, title
, re
.IGNORECASE
):
720 return u
'[download] "' + title
+ '" title did not match pattern "' + matchtitle
+ '"'
721 rejecttitle
= self
.params
.get('rejecttitle', False)
722 if rejecttitle
and re
.search(rejecttitle
, title
, re
.IGNORECASE
):
723 return u
'"' + title
+ '" title matched reject pattern "' + rejecttitle
+ '"'
726 def process_info(self
, info_dict
):
727 """Process a single dictionary returned by an InfoExtractor."""
729 reason
= self
._match
_entry
(info_dict
)
730 if reason
is not None:
731 self
.to_screen(u
'[download] ' + reason
)
734 max_downloads
= self
.params
.get('max_downloads')
735 if max_downloads
is not None:
736 if self
._num
_downloads
> int(max_downloads
):
737 raise MaxDownloadsReached()
739 filename
= self
.prepare_filename(info_dict
)
742 if self
.params
.get('forcetitle', False):
743 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace')
744 if self
.params
.get('forceurl', False):
745 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace')
746 if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
:
747 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
748 if self
.params
.get('forcedescription', False) and 'description' in info_dict
:
749 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace')
750 if self
.params
.get('forcefilename', False) and filename
is not None:
751 print filename
.encode(preferredencoding(), 'xmlcharrefreplace')
752 if self
.params
.get('forceformat', False):
753 print info_dict
['format'].encode(preferredencoding(), 'xmlcharrefreplace')
755 # Do nothing else if in simulate mode
756 if self
.params
.get('simulate', False):
763 dn
= os
.path
.dirname(filename
)
764 if dn
!= '' and not os
.path
.exists(dn
):
766 except (OSError, IOError), err
:
767 self
.trouble(u
'ERROR: unable to create directory ' + unicode(err
))
770 if self
.params
.get('writedescription', False):
772 descfn
= filename
+ '.description'
773 self
.report_writedescription(descfn
)
774 descfile
= open(descfn
, 'wb')
776 descfile
.write(info_dict
['description'].encode('utf-8'))
779 except (OSError, IOError):
780 self
.trouble(u
'ERROR: Cannot write description file ' + descfn
)
783 if self
.params
.get('writeinfojson', False):
784 infofn
= filename
+ '.info.json'
785 self
.report_writeinfojson(infofn
)
788 except (NameError,AttributeError):
789 self
.trouble(u
'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
792 infof
= open(infofn
, 'wb')
794 json_info_dict
= dict((k
,v
) for k
,v
in info_dict
.iteritems() if not k
in ('urlhandle',))
795 json
.dump(json_info_dict
, infof
)
798 except (OSError, IOError):
799 self
.trouble(u
'ERROR: Cannot write metadata to JSON file ' + infofn
)
802 if not self
.params
.get('skip_download', False):
803 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
):
807 success
= self
._do
_download
(filename
, info_dict
)
808 except (OSError, IOError), err
:
809 raise UnavailableVideoError
810 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
811 self
.trouble(u
'ERROR: unable to download video data: %s' % str(err
))
813 except (ContentTooShortError
, ), err
:
814 self
.trouble(u
'ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
))
819 self
.post_process(filename
, info_dict
)
820 except (PostProcessingError
), err
:
821 self
.trouble(u
'ERROR: postprocessing: %s' % str(err
))
824 def download(self
, url_list
):
825 """Download a given list of URLs."""
826 if len(url_list
) > 1 and self
.fixed_template():
827 raise SameFileError(self
.params
['outtmpl'])
830 suitable_found
= False
832 # Go to next InfoExtractor if not suitable
833 if not ie
.suitable(url
):
836 # Suitable InfoExtractor found
837 suitable_found
= True
839 # Extract information from URL and process it
842 # Suitable InfoExtractor had been found; go to next URL
845 if not suitable_found
:
846 self
.trouble(u
'ERROR: no suitable InfoExtractor: %s' % url
)
848 return self
._download
_retcode
850 def post_process(self
, filename
, ie_info
):
851 """Run the postprocessing chain on the given file."""
853 info
['filepath'] = filename
859 def _download_with_rtmpdump(self
, filename
, url
, player_url
):
860 self
.report_destination(filename
)
861 tmpfilename
= self
.temp_name(filename
)
863 # Check for rtmpdump first
865 subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
)
866 except (OSError, IOError):
867 self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run')
870 # Download using rtmpdump. rtmpdump returns exit code 2 when
871 # the connection was interrumpted and resuming appears to be
872 # possible. This is part of rtmpdump's normal usage, AFAIK.
873 basic_args
= ['rtmpdump', '-q'] + [[], ['-W', player_url
]][player_url
is not None] + ['-r', url
, '-o', tmpfilename
]
874 retval
= subprocess
.call(basic_args
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)])
875 while retval
== 2 or retval
== 1:
876 prevsize
= os
.path
.getsize(tmpfilename
)
877 self
.to_screen(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True)
878 time
.sleep(5.0) # This seems to be needed
879 retval
= subprocess
.call(basic_args
+ ['-e'] + [[], ['-k', '1']][retval
== 1])
880 cursize
= os
.path
.getsize(tmpfilename
)
881 if prevsize
== cursize
and retval
== 1:
883 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
884 if prevsize
== cursize
and retval
== 2 and cursize
> 1024:
885 self
.to_screen(u
'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
889 self
.to_screen(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(tmpfilename
))
890 self
.try_rename(tmpfilename
, filename
)
893 self
.trouble(u
'\nERROR: rtmpdump exited with code %d' % retval
)
896 def _do_download(self
, filename
, info_dict
):
897 url
= info_dict
['url']
898 player_url
= info_dict
.get('player_url', None)
900 # Check file already present
901 if self
.params
.get('continuedl', False) and os
.path
.isfile(filename
) and not self
.params
.get('nopart', False):
902 self
.report_file_already_downloaded(filename
)
905 # Attempt to download using rtmpdump
906 if url
.startswith('rtmp'):
907 return self
._download
_with
_rtmpdump
(filename
, url
, player_url
)
909 tmpfilename
= self
.temp_name(filename
)
912 # Do not include the Accept-Encoding header
913 headers
= {'Youtubedl-no-compression': 'True'}
914 basic_request
= urllib2
.Request(url
, None, headers
)
915 request
= urllib2
.Request(url
, None, headers
)
917 # Establish possible resume length
918 if os
.path
.isfile(tmpfilename
):
919 resume_len
= os
.path
.getsize(tmpfilename
)
925 if self
.params
.get('continuedl', False):
926 self
.report_resuming_byte(resume_len
)
927 request
.add_header('Range','bytes=%d-' % resume_len
)
933 retries
= self
.params
.get('retries', 0)
934 while count
<= retries
:
935 # Establish connection
937 if count
== 0 and 'urlhandle' in info_dict
:
938 data
= info_dict
['urlhandle']
939 data
= urllib2
.urlopen(request
)
941 except (urllib2
.HTTPError
, ), err
:
942 if (err
.code
< 500 or err
.code
>= 600) and err
.code
!= 416:
943 # Unexpected HTTP error
945 elif err
.code
== 416:
946 # Unable to resume (requested range not satisfiable)
948 # Open the connection again without the range header
949 data
= urllib2
.urlopen(basic_request
)
950 content_length
= data
.info()['Content-Length']
951 except (urllib2
.HTTPError
, ), err
:
952 if err
.code
< 500 or err
.code
>= 600:
955 # Examine the reported length
956 if (content_length
is not None and
957 (resume_len
- 100 < long(content_length
) < resume_len
+ 100)):
958 # The file had already been fully downloaded.
959 # Explanation to the above condition: in issue #175 it was revealed that
960 # YouTube sometimes adds or removes a few bytes from the end of the file,
961 # changing the file size slightly and causing problems for some users. So
962 # I decided to implement a suggested change and consider the file
963 # completely downloaded if the file size differs less than 100 bytes from
964 # the one in the hard drive.
965 self
.report_file_already_downloaded(filename
)
966 self
.try_rename(tmpfilename
, filename
)
969 # The length does not match, we start the download over
970 self
.report_unable_to_resume()
976 self
.report_retry(count
, retries
)
979 self
.trouble(u
'ERROR: giving up after %s retries' % retries
)
982 data_len
= data
.info().get('Content-length', None)
983 if data_len
is not None:
984 data_len
= long(data_len
) + resume_len
985 data_len_str
= self
.format_bytes(data_len
)
986 byte_counter
= 0 + resume_len
992 data_block
= data
.read(block_size
)
994 if len(data_block
) == 0:
996 byte_counter
+= len(data_block
)
998 # Open file just in time
1001 (stream
, tmpfilename
) = sanitize_open(tmpfilename
, open_mode
)
1002 assert stream
is not None
1003 filename
= self
.undo_temp_name(tmpfilename
)
1004 self
.report_destination(filename
)
1005 except (OSError, IOError), err
:
1006 self
.trouble(u
'ERROR: unable to open for writing: %s' % str(err
))
1009 stream
.write(data_block
)
1010 except (IOError, OSError), err
:
1011 self
.trouble(u
'\nERROR: unable to write data: %s' % str(err
))
1013 block_size
= self
.best_block_size(after
- before
, len(data_block
))
1016 speed_str
= self
.calc_speed(start
, time
.time(), byte_counter
- resume_len
)
1017 if data_len
is None:
1018 self
.report_progress('Unknown %', data_len_str
, speed_str
, 'Unknown ETA')
1020 percent_str
= self
.calc_percent(byte_counter
, data_len
)
1021 eta_str
= self
.calc_eta(start
, time
.time(), data_len
- resume_len
, byte_counter
- resume_len
)
1022 self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
)
1025 self
.slow_down(start
, byte_counter
- resume_len
)
1028 self
.trouble(u
'\nERROR: Did not get any data blocks')
1031 self
.report_finish()
1032 if data_len
is not None and byte_counter
!= data_len
:
1033 raise ContentTooShortError(byte_counter
, long(data_len
))
1034 self
.try_rename(tmpfilename
, filename
)
1036 # Update file modification time
1037 if self
.params
.get('updatetime', True):
1038 info_dict
['filetime'] = self
.try_utime(filename
, data
.info().get('last-modified', None))
1043 class InfoExtractor(object):
1044 """Information Extractor class.
1046 Information extractors are the classes that, given a URL, extract
1047 information from the video (or videos) the URL refers to. This
1048 information includes the real video URL, the video title and simplified
1049 title, author and others. The information is stored in a dictionary
1050 which is then passed to the FileDownloader. The FileDownloader
1051 processes this information possibly downloading the video to the file
1052 system, among other possible outcomes. The dictionaries must include
1053 the following fields:
1055 id: Video identifier.
1056 url: Final video URL.
1057 uploader: Nickname of the video uploader.
1058 title: Literal title.
1059 stitle: Simplified title.
1060 ext: Video filename extension.
1061 format: Video format.
1062 player_url: SWF Player URL (may be None).
1064 The following fields are optional. Their primary purpose is to allow
1065 youtube-dl to serve as the backend for a video search function, such
1066 as the one in youtube2mp3. They are only used when their respective
1067 forced printing functions are called:
1069 thumbnail: Full URL to a video thumbnail image.
1070 description: One-line video description.
1072 Subclasses of this one should re-define the _real_initialize() and
1073 _real_extract() methods and define a _VALID_URL regexp.
1074 Probably, they should also be added to the list of extractors.
1080 def __init__(self
, downloader
=None):
1081 """Constructor. Receives an optional downloader."""
1083 self
.set_downloader(downloader
)
def suitable(self, url):
    """Return True when *url* matches this extractor's _VALID_URL regexp."""
    match = re.match(self._VALID_URL, url)
    return match is not None
1089 def initialize(self
):
1090 """Initializes an instance (authentication, etc)."""
1092 self
._real
_initialize
()
1095 def extract(self
, url
):
1096 """Extracts URL information and returns it in list of dicts."""
1098 return self
._real
_extract
(url
)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # The FileDownloader registers itself here when the IE is added to
    # it ("mutual registration" -- see the FileDownloader docstring).
    self._downloader = downloader
1104 def _real_initialize(self
):
1105 """Real initialization process. Redefine in subclasses."""
1108 def _real_extract(self
, url
):
1109 """Real extraction process. Redefine in subclasses."""
1113 class YoutubeIE(InfoExtractor
):
1114 """Information extractor for youtube.com."""
1116 _VALID_URL
= r
'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1117 _LANG_URL
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1118 _LOGIN_URL
= 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1119 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1120 _NETRC_MACHINE
= 'youtube'
1121 # Listed in order of quality
1122 _available_formats
= ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1123 _available_formats_prefer_free
= ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1124 _video_extensions
= {
1130 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1135 _video_dimensions
= {
1150 IE_NAME
= u
'youtube'
def report_lang(self):
    """Announce that we are about to force the site language."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
def report_login(self):
    """Announce a login attempt on the downloader's screen."""
    message = u'[youtube] Logging in'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm age with the site."""
    message = u'[youtube] Confirming age'
    self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce the download of the watch page for `video_id`."""
    message = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the get_video_info page for `video_id`."""
    message = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce the start of information extraction for `video_id`."""
    message = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that `format` is not available for `video_id`."""
    # (Previous docstring said "Report extracted video URL", which was a
    # copy-paste error; this method reports an unavailable format.)
    message = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_screen(message)
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'[youtube] RTMP download detected'
    self._downloader.to_screen(message)
1184 def _print_formats(self
, formats
):
1185 print 'Available formats:'
1187 print '%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???'))
1189 def _real_initialize(self
):
1190 if self
._downloader
is None:
1195 downloader_params
= self
._downloader
.params
1197 # Attempt to use provided username and password or .netrc data
1198 if downloader_params
.get('username', None) is not None:
1199 username
= downloader_params
['username']
1200 password
= downloader_params
['password']
1201 elif downloader_params
.get('usenetrc', False):
1203 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
1204 if info
is not None:
1208 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
1209 except (IOError, netrc
.NetrcParseError
), err
:
1210 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
))
1214 request
= urllib2
.Request(self
._LANG
_URL
)
1217 urllib2
.urlopen(request
).read()
1218 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1219 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
))
1222 # No authentication to be performed
1223 if username
is None:
1228 'current_form': 'loginForm',
1230 'action_login': 'Log In',
1231 'username': username
,
1232 'password': password
,
1234 request
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
))
1237 login_results
= urllib2
.urlopen(request
).read()
1238 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
1239 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
1241 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1242 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
))
1248 'action_confirm': 'Confirm',
1250 request
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
))
1252 self
.report_age_confirmation()
1253 age_results
= urllib2
.urlopen(request
).read()
1254 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1255 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
1258 def _real_extract(self
, url
):
1259 # Extract video id from URL
1260 mobj
= re
.match(self
._VALID
_URL
, url
)
1262 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1264 video_id
= mobj
.group(2)
1267 self
.report_video_webpage_download(video_id
)
1268 request
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
)
1270 video_webpage
= urllib2
.urlopen(request
).read()
1271 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1272 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
1275 # Attempt to extract SWF player URL
1276 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
1277 if mobj
is not None:
1278 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
1283 self
.report_video_info_webpage_download(video_id
)
1284 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1285 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1286 % (video_id
, el_type
))
1287 request
= urllib2
.Request(video_info_url
)
1289 video_info_webpage
= urllib2
.urlopen(request
).read()
1290 video_info
= parse_qs(video_info_webpage
)
1291 if 'token' in video_info
:
1293 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1294 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
))
1296 if 'token' not in video_info
:
1297 if 'reason' in video_info
:
1298 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8'))
1300 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
1303 # Start extracting information
1304 self
.report_information_extraction(video_id
)
1307 if 'author' not in video_info
:
1308 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1310 video_uploader
= urllib
.unquote_plus(video_info
['author'][0])
1313 if 'title' not in video_info
:
1314 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1316 video_title
= urllib
.unquote_plus(video_info
['title'][0])
1317 video_title
= video_title
.decode('utf-8')
1318 video_title
= sanitize_title(video_title
)
1321 simple_title
= _simplify_title(video_title
)
1324 if 'thumbnail_url' not in video_info
:
1325 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
1326 video_thumbnail
= ''
1327 else: # don't panic if we can't find it
1328 video_thumbnail
= urllib
.unquote_plus(video_info
['thumbnail_url'][0])
1332 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
1333 if mobj
is not None:
1334 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
1335 format_expressions
= ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1336 for expression
in format_expressions
:
1338 upload_date
= datetime
.datetime
.strptime(upload_date
, expression
).strftime('%Y%m%d')
1346 video_description
= u
'No description available.'
1347 if self
._downloader
.params
.get('forcedescription', False) or self
._downloader
.params
.get('writedescription', False):
1348 mobj
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage
)
1349 if mobj
is not None:
1350 video_description
= mobj
.group(1).decode('utf-8')
1352 html_parser
= lxml
.etree
.HTMLParser(encoding
='utf-8')
1353 vwebpage_doc
= lxml
.etree
.parse(StringIO
.StringIO(video_webpage
), html_parser
)
1354 video_description
= u
''.join(vwebpage_doc
.xpath('id("eow-description")//text()'))
1355 # TODO use another parser
1358 video_token
= urllib
.unquote_plus(video_info
['token'][0])
1360 # Decide which formats to download
1361 req_format
= self
._downloader
.params
.get('format', None)
1363 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
1364 self
.report_rtmp_download()
1365 video_url_list
= [(None, video_info
['conn'][0])]
1366 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
1367 url_data_strs
= video_info
['url_encoded_fmt_stream_map'][0].split(',')
1368 url_data
= [parse_qs(uds
) for uds
in url_data_strs
]
1369 url_data
= filter(lambda ud
: 'itag' in ud
and 'url' in ud
, url_data
)
1370 url_map
= dict((ud
['itag'][0], ud
['url'][0]) for ud
in url_data
)
1372 format_limit
= self
._downloader
.params
.get('format_limit', None)
1373 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
1374 if format_limit
is not None and format_limit
in available_formats
:
1375 format_list
= available_formats
[available_formats
.index(format_limit
):]
1377 format_list
= available_formats
1378 existing_formats
= [x
for x
in format_list
if x
in url_map
]
1379 if len(existing_formats
) == 0:
1380 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
1382 if self
._downloader
.params
.get('listformats', None):
1383 self
._print
_formats
(existing_formats
)
1385 if req_format
is None or req_format
== 'best':
1386 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
1387 elif req_format
== 'worst':
1388 video_url_list
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality
1389 elif req_format
in ('-1', 'all'):
1390 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
1392 # Specific formats. We pick the first in a slash-delimited sequence.
1393 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1394 req_formats
= req_format
.split('/')
1395 video_url_list
= None
1396 for rf
in req_formats
:
1398 video_url_list
= [(rf
, url_map
[rf
])]
1400 if video_url_list
is None:
1401 self
._downloader
.trouble(u
'ERROR: requested format not available')
1404 self
._downloader
.trouble(u
'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1407 for format_param
, video_real_url
in video_url_list
:
1408 # At this point we have a new video
1409 self
._downloader
.increment_downloads()
1412 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
1415 # Process video information
1416 self
._downloader
.process_info({
1417 'id': video_id
.decode('utf-8'),
1418 'url': video_real_url
.decode('utf-8'),
1419 'uploader': video_uploader
.decode('utf-8'),
1420 'upload_date': upload_date
,
1421 'title': video_title
,
1422 'stitle': simple_title
,
1423 'ext': video_extension
.decode('utf-8'),
1424 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
1425 'thumbnail': video_thumbnail
.decode('utf-8'),
1426 'description': video_description
,
1427 'player_url': player_url
,
1429 except UnavailableVideoError
, err
:
1430 self
._downloader
.trouble(u
'\nERROR: unable to download video')
1433 class MetacafeIE(InfoExtractor
):
1434 """Information Extractor for metacafe.com."""
1436 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1437 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
1438 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1440 IE_NAME
= u
'metacafe'
def __init__(self, youtube_ie, downloader=None):
    """Store the YoutubeIE used to delegate yt-prefixed metacafe IDs."""
    InfoExtractor.__init__(self, downloader)
    # Kept so _real_extract() can hand off 'yt-...' video ids to YouTube.
    self._youtube_ie = youtube_ie
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm age with metacafe."""
    message = u'[metacafe] Confirming age'
    self._downloader.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce the download of the metacafe page for `video_id`."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce the start of information extraction for `video_id`."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1462 def _real_initialize(self
):
1463 # Retrieve disclaimer
1464 request
= urllib2
.Request(self
._DISCLAIMER
)
1466 self
.report_disclaimer()
1467 disclaimer
= urllib2
.urlopen(request
).read()
1468 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1469 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
))
1475 'submit': "Continue - I'm over 18",
1477 request
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
))
1479 self
.report_age_confirmation()
1480 disclaimer
= urllib2
.urlopen(request
).read()
1481 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1482 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
1485 def _real_extract(self
, url
):
1486 # Extract id and simplified title from URL
1487 mobj
= re
.match(self
._VALID
_URL
, url
)
1489 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1492 video_id
= mobj
.group(1)
1494 # Check if video comes from YouTube
1495 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
1496 if mobj2
is not None:
1497 self
._youtube
_ie
.extract('http://www.youtube.com/watch?v=%s' % mobj2
.group(1))
1500 # At this point we have a new video
1501 self
._downloader
.increment_downloads()
1503 simple_title
= mobj
.group(2).decode('utf-8')
1505 # Retrieve video webpage to extract further information
1506 request
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
1508 self
.report_download_webpage(video_id
)
1509 webpage
= urllib2
.urlopen(request
).read()
1510 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1511 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1514 # Extract URL, uploader and title from webpage
1515 self
.report_extraction(video_id
)
1516 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
1517 if mobj
is not None:
1518 mediaURL
= urllib
.unquote(mobj
.group(1))
1519 video_extension
= mediaURL
[-3:]
1521 # Extract gdaKey if available
1522 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
1524 video_url
= mediaURL
1526 gdaKey
= mobj
.group(1)
1527 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
1529 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
1531 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1533 vardict
= parse_qs(mobj
.group(1))
1534 if 'mediaData' not in vardict
:
1535 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1537 mobj
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0])
1539 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1541 mediaURL
= mobj
.group(1).replace('\\/', '/')
1542 video_extension
= mediaURL
[-3:]
1543 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2))
1545 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
1547 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1549 video_title
= mobj
.group(1).decode('utf-8')
1550 video_title
= sanitize_title(video_title
)
1552 mobj
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
)
1554 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1556 video_uploader
= mobj
.group(1)
1559 # Process video information
1560 self
._downloader
.process_info({
1561 'id': video_id
.decode('utf-8'),
1562 'url': video_url
.decode('utf-8'),
1563 'uploader': video_uploader
.decode('utf-8'),
1564 'upload_date': u
'NA',
1565 'title': video_title
,
1566 'stitle': simple_title
,
1567 'ext': video_extension
.decode('utf-8'),
1571 except UnavailableVideoError
:
1572 self
._downloader
.trouble(u
'\nERROR: unable to download video')
1575 class DailymotionIE(InfoExtractor
):
1576 """Information Extractor for Dailymotion"""
1578 _VALID_URL
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1579 IE_NAME
= u
'dailymotion'
def __init__(self, downloader=None):
    # Plain pass-through to the InfoExtractor base constructor.
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the dailymotion page for `video_id`."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce the start of information extraction for `video_id`."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1592 def _real_extract(self
, url
):
1593 htmlParser
= HTMLParser
.HTMLParser()
1595 # Extract id and simplified title from URL
1596 mobj
= re
.match(self
._VALID
_URL
, url
)
1598 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1601 # At this point we have a new video
1602 self
._downloader
.increment_downloads()
1603 video_id
= mobj
.group(1)
1605 video_extension
= 'flv'
1607 # Retrieve video webpage to extract further information
1608 request
= urllib2
.Request(url
)
1609 request
.add_header('Cookie', 'family_filter=off')
1611 self
.report_download_webpage(video_id
)
1612 webpage
= urllib2
.urlopen(request
).read()
1613 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1614 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1617 # Extract URL, uploader and title from webpage
1618 self
.report_extraction(video_id
)
1619 mobj
= re
.search(r
'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage
)
1621 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1623 sequence
= urllib
.unquote(mobj
.group(1))
1624 mobj
= re
.search(r
',\"sdURL\"\:\"([^\"]+?)\",', sequence
)
1626 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1628 mediaURL
= urllib
.unquote(mobj
.group(1)).replace('\\', '')
1630 # if needed add http://www.dailymotion.com/ if relative URL
1632 video_url
= mediaURL
1634 mobj
= re
.search(r
'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage
)
1636 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1638 video_title
= htmlParser
.unescape(mobj
.group('title')).decode('utf-8')
1639 video_title
= sanitize_title(video_title
)
1640 simple_title
= _simplify_title(video_title
)
1642 mobj
= re
.search(r
'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage
)
1644 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1646 video_uploader
= mobj
.group(1)
1649 # Process video information
1650 self
._downloader
.process_info({
1651 'id': video_id
.decode('utf-8'),
1652 'url': video_url
.decode('utf-8'),
1653 'uploader': video_uploader
.decode('utf-8'),
1654 'upload_date': u
'NA',
1655 'title': video_title
,
1656 'stitle': simple_title
,
1657 'ext': video_extension
.decode('utf-8'),
1661 except UnavailableVideoError
:
1662 self
._downloader
.trouble(u
'\nERROR: unable to download video')
1665 class GoogleIE(InfoExtractor
):
1666 """Information extractor for video.google.com."""
1668 _VALID_URL
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1669 IE_NAME
= u
'video.google'
def __init__(self, downloader=None):
    # Plain pass-through to the InfoExtractor base constructor.
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the video.google.com page for `video_id`."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce the start of information extraction for `video_id`."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1682 def _real_extract(self
, url
):
1683 # Extract id from URL
1684 mobj
= re
.match(self
._VALID
_URL
, url
)
1686 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1689 # At this point we have a new video
1690 self
._downloader
.increment_downloads()
1691 video_id
= mobj
.group(1)
1693 video_extension
= 'mp4'
1695 # Retrieve video webpage to extract further information
1696 request
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
)
1698 self
.report_download_webpage(video_id
)
1699 webpage
= urllib2
.urlopen(request
).read()
1700 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1701 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1704 # Extract URL, uploader, and title from webpage
1705 self
.report_extraction(video_id
)
1706 mobj
= re
.search(r
"download_url:'([^']+)'", webpage
)
1708 video_extension
= 'flv'
1709 mobj
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
)
1711 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1713 mediaURL
= urllib
.unquote(mobj
.group(1))
1714 mediaURL
= mediaURL
.replace('\\x3d', '\x3d')
1715 mediaURL
= mediaURL
.replace('\\x26', '\x26')
1717 video_url
= mediaURL
1719 mobj
= re
.search(r
'<title>(.*)</title>', webpage
)
1721 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1723 video_title
= mobj
.group(1).decode('utf-8')
1724 video_title
= sanitize_title(video_title
)
1725 simple_title
= _simplify_title(video_title
)
1727 # Extract video description
1728 mobj
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
)
1730 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1732 video_description
= mobj
.group(1).decode('utf-8')
1733 if not video_description
:
1734 video_description
= 'No description available.'
1736 # Extract video thumbnail
1737 if self
._downloader
.params
.get('forcethumbnail', False):
1738 request
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
)))
1740 webpage
= urllib2
.urlopen(request
).read()
1741 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1742 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1744 mobj
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
)
1746 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1748 video_thumbnail
= mobj
.group(1)
1749 else: # we need something to pass to process_info
1750 video_thumbnail
= ''
1753 # Process video information
1754 self
._downloader
.process_info({
1755 'id': video_id
.decode('utf-8'),
1756 'url': video_url
.decode('utf-8'),
1758 'upload_date': u
'NA',
1759 'title': video_title
,
1760 'stitle': simple_title
,
1761 'ext': video_extension
.decode('utf-8'),
1765 except UnavailableVideoError
:
1766 self
._downloader
.trouble(u
'\nERROR: unable to download video')
1769 class PhotobucketIE(InfoExtractor
):
1770 """Information extractor for photobucket.com."""
1772 _VALID_URL
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1773 IE_NAME
= u
'photobucket'
def __init__(self, downloader=None):
    # Plain pass-through to the InfoExtractor base constructor.
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the photobucket page for `video_id`."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce the start of information extraction for `video_id`."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1786 def _real_extract(self
, url
):
1787 # Extract id from URL
1788 mobj
= re
.match(self
._VALID
_URL
, url
)
1790 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1793 # At this point we have a new video
1794 self
._downloader
.increment_downloads()
1795 video_id
= mobj
.group(1)
1797 video_extension
= 'flv'
1799 # Retrieve video webpage to extract further information
1800 request
= urllib2
.Request(url
)
1802 self
.report_download_webpage(video_id
)
1803 webpage
= urllib2
.urlopen(request
).read()
1804 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1805 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1808 # Extract URL, uploader, and title from webpage
1809 self
.report_extraction(video_id
)
1810 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
1812 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1814 mediaURL
= urllib
.unquote(mobj
.group(1))
1816 video_url
= mediaURL
1818 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
1820 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1822 video_title
= mobj
.group(1).decode('utf-8')
1823 video_title
= sanitize_title(video_title
)
1824 simple_title
= _simplify_title(vide_title
)
1826 video_uploader
= mobj
.group(2).decode('utf-8')
1829 # Process video information
1830 self
._downloader
.process_info({
1831 'id': video_id
.decode('utf-8'),
1832 'url': video_url
.decode('utf-8'),
1833 'uploader': video_uploader
,
1834 'upload_date': u
'NA',
1835 'title': video_title
,
1836 'stitle': simple_title
,
1837 'ext': video_extension
.decode('utf-8'),
1841 except UnavailableVideoError
:
1842 self
._downloader
.trouble(u
'\nERROR: unable to download video')
1845 class YahooIE(InfoExtractor
):
1846 """Information extractor for video.yahoo.com."""
1848 # _VALID_URL matches all Yahoo! Video URLs
1849 # _VPAGE_URL matches only the extractable '/watch/' URLs
1850 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1851 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1852 IE_NAME
= u
'video.yahoo'
def __init__(self, downloader=None):
    # Plain pass-through to the InfoExtractor base constructor.
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the video.yahoo.com page for `video_id`."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce the start of information extraction for `video_id`."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1865 def _real_extract(self
, url
, new_video
=True):
1866 # Extract ID from URL
1867 mobj
= re
.match(self
._VALID
_URL
, url
)
1869 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1872 # At this point we have a new video
1873 self
._downloader
.increment_downloads()
1874 video_id
= mobj
.group(2)
1875 video_extension
= 'flv'
1877 # Rewrite valid but non-extractable URLs as
1878 # extractable English language /watch/ URLs
1879 if re
.match(self
._VPAGE
_URL
, url
) is None:
1880 request
= urllib2
.Request(url
)
1882 webpage
= urllib2
.urlopen(request
).read()
1883 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1884 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1887 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
1889 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
1891 yahoo_id
= mobj
.group(1)
1893 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
1895 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
1897 yahoo_vid
= mobj
.group(1)
1899 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
1900 return self
._real
_extract
(url
, new_video
=False)
1902 # Retrieve video webpage to extract further information
1903 request
= urllib2
.Request(url
)
1905 self
.report_download_webpage(video_id
)
1906 webpage
= urllib2
.urlopen(request
).read()
1907 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1908 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1911 # Extract uploader and title from webpage
1912 self
.report_extraction(video_id
)
1913 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
1915 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1917 video_title
= mobj
.group(1).decode('utf-8')
1918 simple_title
= _simplify_title(video_title
)
1920 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
1922 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
1924 video_uploader
= mobj
.group(1).decode('utf-8')
1926 # Extract video thumbnail
1927 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
1929 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1931 video_thumbnail
= mobj
.group(1).decode('utf-8')
1933 # Extract video description
1934 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
1936 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1938 video_description
= mobj
.group(1).decode('utf-8')
1939 if not video_description
:
1940 video_description
= 'No description available.'
1942 # Extract video height and width
1943 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
1945 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
1947 yv_video_height
= mobj
.group(1)
1949 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
1951 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
1953 yv_video_width
= mobj
.group(1)
1955 # Retrieve video playlist to extract media URL
1956 # I'm not completely sure what all these options are, but we
1957 # seem to need most of them, otherwise the server sends a 401.
1958 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1959 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
1960 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
1961 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
1962 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1964 self
.report_download_webpage(video_id
)
1965 webpage
= urllib2
.urlopen(request
).read()
1966 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1967 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1970 # Extract media URL from playlist XML
1971 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
1973 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
1975 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
1976 video_url
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
)
1979 # Process video information
1980 self
._downloader
.process_info({
1981 'id': video_id
.decode('utf-8'),
1983 'uploader': video_uploader
,
1984 'upload_date': u
'NA',
1985 'title': video_title
,
1986 'stitle': simple_title
,
1987 'ext': video_extension
.decode('utf-8'),
1988 'thumbnail': video_thumbnail
.decode('utf-8'),
1989 'description': video_description
,
1990 'thumbnail': video_thumbnail
,
1993 except UnavailableVideoError
:
1994 self
._downloader
.trouble(u
'\nERROR: unable to download video')
1997 class VimeoIE(InfoExtractor
):
1998 """Information extractor for vimeo.com."""
2000 # _VALID_URL matches Vimeo URLs
2001 _VALID_URL
= r
'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
    # Plain pass-through to the InfoExtractor base constructor.
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the vimeo page for `video_id`."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce the start of information extraction for `video_id`."""
    message = u'[vimeo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
2015 def _real_extract(self
, url
, new_video
=True):
2016 # Extract ID from URL
2017 mobj
= re
.match(self
._VALID
_URL
, url
)
2019 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
2022 # At this point we have a new video
2023 self
._downloader
.increment_downloads()
2024 video_id
= mobj
.group(1)
2026 # Retrieve video webpage to extract further information
2027 request
= urllib2
.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id
, None, std_headers
)
2029 self
.report_download_webpage(video_id
)
2030 webpage
= urllib2
.urlopen(request
).read()
2031 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2032 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
2035 # Now we begin extracting as much information as we can from what we
2036 # retrieved. First we extract the information common to all extractors,
2037 # and latter we extract those that are Vimeo specific.
2038 self
.report_extraction(video_id
)
2041 mobj
= re
.search(r
'<caption>(.*?)</caption>', webpage
)
2043 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
2045 video_title
= mobj
.group(1).decode('utf-8')
2046 simple_title
= _simplify_title(video_title
)
2049 mobj
= re
.search(r
'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage
)
2051 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
2053 video_uploader
= mobj
.group(1).decode('utf-8')
2055 # Extract video thumbnail
2056 mobj
= re
.search(r
'<thumbnail>(.*?)</thumbnail>', webpage
)
2058 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
2060 video_thumbnail
= mobj
.group(1).decode('utf-8')
2062 # # Extract video description
2063 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2065 # self._downloader.trouble(u'ERROR: unable to extract video description')
2067 # video_description = mobj.group(1).decode('utf-8')
2068 # if not video_description: video_description = 'No description available.'
2069 video_description
= 'Foo.'
2071 # Vimeo specific: extract request signature
2072 mobj
= re
.search(r
'<request_signature>(.*?)</request_signature>', webpage
)
2074 self
._downloader
.trouble(u
'ERROR: unable to extract request signature')
2076 sig
= mobj
.group(1).decode('utf-8')
2078 # Vimeo specific: extract video quality information
2079 mobj
= re
.search(r
'<isHD>(\d+)</isHD>', webpage
)
2081 self
._downloader
.trouble(u
'ERROR: unable to extract video quality information')
2083 quality
= mobj
.group(1).decode('utf-8')
2085 if int(quality
) == 1:
2090 # Vimeo specific: Extract request signature expiration
2091 mobj
= re
.search(r
'<request_signature_expires>(.*?)</request_signature_expires>', webpage
)
2093 self
._downloader
.trouble(u
'ERROR: unable to extract request signature expiration')
2095 sig_exp
= mobj
.group(1).decode('utf-8')
2097 video_url
= "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id
, sig
, sig_exp
, quality
)
2100 # Process video information
2101 self
._downloader
.process_info({
2102 'id': video_id
.decode('utf-8'),
2104 'uploader': video_uploader
,
2105 'upload_date': u
'NA',
2106 'title': video_title
,
2107 'stitle': simple_title
,
2109 'thumbnail': video_thumbnail
.decode('utf-8'),
2110 'description': video_description
,
2111 'thumbnail': video_thumbnail
,
2112 'description': video_description
,
2115 except UnavailableVideoError
:
2116 self
._downloader
.trouble(u
'ERROR: unable to download video')
2119 class GenericIE(InfoExtractor
):
2120 """Generic last-resort information extractor."""
2123 IE_NAME
= u
'generic'
2125 def __init__(self
, downloader
=None):
2126 InfoExtractor
.__init
__(self
, downloader
)
2128 def report_download_webpage(self
, video_id
):
2129 """Report webpage download."""
2130 self
._downloader
.to_screen(u
'WARNING: Falling back on generic information extractor.')
2131 self
._downloader
.to_screen(u
'[generic] %s: Downloading webpage' % video_id
)
2133 def report_extraction(self
, video_id
):
2134 """Report information extraction."""
2135 self
._downloader
.to_screen(u
'[generic] %s: Extracting information' % video_id
)
2137 def _real_extract(self
, url
):
2138 # At this point we have a new video
2139 self
._downloader
.increment_downloads()
2141 video_id
= url
.split('/')[-1]
2142 request
= urllib2
.Request(url
)
2144 self
.report_download_webpage(video_id
)
2145 webpage
= urllib2
.urlopen(request
).read()
2146 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2147 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
2149 except ValueError, err
:
2150 # since this is the last-resort InfoExtractor, if
2151 # this error is thrown, it'll be thrown here
2152 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
2155 self
.report_extraction(video_id
)
2156 # Start with something easy: JW Player in SWFObject
2157 mobj
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2159 # Broaden the search a little bit
2160 mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage)
2162 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2165 # It's possible that one of the regexes
2166 # matched, but returned an empty group:
2167 if mobj.group(1) is None:
2168 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2171 video_url = urllib.unquote(mobj.group(1))
2172 video_id = os.path.basename(video_url)
2174 # here's a fun little line of code for you:
2175 video_extension = os.path.splitext(video_id)[1][1:]
2176 video_id = os.path.splitext(video_id)[0]
2178 # it's tempting to parse this further, but you would
2179 # have to take into account all the variations like
2180 # Video Title - Site Name
2181 # Site Name | Video Title
2182 # Video Title - Tagline | Site Name
2183 # and so on and so forth; it's just not practical
2184 mobj = re.search(r'<title>(.*)</title>', webpage)
2186 self._downloader.trouble(u'ERROR: unable to extract title')
2188 video_title = mobj.group(1).decode('utf-8')
2189 video_title = sanitize_title(video_title)
2190 simple_title = _simplify_title(video_title)
2192 # video uploader is domain name
2193 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2195 self._downloader.trouble(u'ERROR: unable to extract title')
2197 video_uploader = mobj.group(1).decode('utf-8')
2200 # Process video information
2201 self._downloader.process_info({
2202 'id': video_id.decode('utf-8'),
2203 'url': video_url.decode('utf-8'),
2204 'uploader': video_uploader,
2205 'upload_date': u'NA',
2206 'title': video_title,
2207 'stitle': simple_title,
2208 'ext': video_extension.decode('utf-8'),
2212 except UnavailableVideoError, err:
2213 self._downloader.trouble(u'\nERROR: unable to download video')
2216 class YoutubeSearchIE(InfoExtractor):
2217 """Information Extractor for YouTube search queries."""
2218 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2219 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2220 _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"'
2221 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2223 _max_youtube_results = 1000
2224 IE_NAME = u'youtube:search'
2226 def __init__(self, youtube_ie, downloader=None):
2227 InfoExtractor.__init__(self, downloader)
2228 self._youtube_ie = youtube_ie
2230 def report_download_page(self, query, pagenum):
2231 """Report attempt to download playlist page with given number."""
2232 query = query.decode(preferredencoding())
2233 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2235 def _real_initialize(self):
2236 self._youtube_ie.initialize()
2238 def _real_extract(self, query):
2239 mobj = re.match(self._VALID_URL, query)
2241 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2244 prefix, query = query.split(':')
2246 query = query.encode('utf-8')
2248 self._download_n_results(query, 1)
2250 elif prefix == 'all':
2251 self._download_n_results(query, self._max_youtube_results)
2257 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2259 elif n > self._max_youtube_results:
2260 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2261 n = self._max_youtube_results
2262 self._download_n_results(query, n)
2264 except ValueError: # parsing prefix as integer fails
2265 self._download_n_results(query, 1)
2268 def _download_n_results(self, query, n):
2269 """Downloads a specified number of results for a query"""
2272 already_seen = set()
2276 self.report_download_page(query, pagenum)
2277 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2278 request = urllib2.Request(result_url)
2280 page = urllib2.urlopen(request).read()
2281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2282 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2285 # Extract video identifiers
2286 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2287 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2288 if video_id not in already_seen:
2289 video_ids.append(video_id)
2290 already_seen.add(video_id)
2291 if len(video_ids) == n:
2292 # Specified n videos reached
2293 for id in video_ids:
2294 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2297 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2298 for id in video_ids:
2299 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2302 pagenum = pagenum + 1
2305 class GoogleSearchIE(InfoExtractor):
2306 """Information Extractor for Google Video search queries."""
2307 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2308 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2309 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2310 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2312 _max_google_results = 1000
2313 IE_NAME = u'video.google:search'
2315 def __init__(self, google_ie, downloader=None):
2316 InfoExtractor.__init__(self, downloader)
2317 self._google_ie = google_ie
2319 def report_download_page(self, query, pagenum):
2320 """Report attempt to download playlist page with given number."""
2321 query = query.decode(preferredencoding())
2322 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2324 def _real_initialize(self):
2325 self._google_ie.initialize()
2327 def _real_extract(self, query):
2328 mobj = re.match(self._VALID_URL, query)
2330 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2333 prefix, query = query.split(':')
2335 query = query.encode('utf-8')
2337 self._download_n_results(query, 1)
2339 elif prefix == 'all':
2340 self._download_n_results(query, self._max_google_results)
2346 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2348 elif n > self._max_google_results:
2349 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2350 n = self._max_google_results
2351 self._download_n_results(query, n)
2353 except ValueError: # parsing prefix as integer fails
2354 self._download_n_results(query, 1)
2357 def _download_n_results(self, query, n):
2358 """Downloads a specified number of results for a query"""
2361 already_seen = set()
2365 self.report_download_page(query, pagenum)
2366 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2367 request = urllib2.Request(result_url)
2369 page = urllib2.urlopen(request).read()
2370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2374 # Extract video identifiers
2375 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2376 video_id = mobj.group(1)
2377 if video_id not in already_seen:
2378 video_ids.append(video_id)
2379 already_seen.add(video_id)
2380 if len(video_ids) == n:
2381 # Specified n videos reached
2382 for id in video_ids:
2383 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2386 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2387 for id in video_ids:
2388 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2391 pagenum = pagenum + 1
2394 class YahooSearchIE(InfoExtractor):
2395 """Information Extractor for Yahoo! Video search queries."""
2396 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2397 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2398 _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"'
2399 _MORE_PAGES_INDICATOR = r'\s*Next'
2401 _max_yahoo_results = 1000
2402 IE_NAME = u'video.yahoo:search'
2404 def __init__(self, yahoo_ie, downloader=None):
2405 InfoExtractor.__init__(self, downloader)
2406 self._yahoo_ie = yahoo_ie
2408 def report_download_page(self, query, pagenum):
2409 """Report attempt to download playlist page with given number."""
2410 query = query.decode(preferredencoding())
2411 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2413 def _real_initialize(self):
2414 self._yahoo_ie.initialize()
2416 def _real_extract(self, query):
2417 mobj = re.match(self._VALID_URL, query)
2419 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2422 prefix, query = query.split(':')
2424 query = query.encode('utf-8')
2426 self._download_n_results(query, 1)
2428 elif prefix == 'all':
2429 self._download_n_results(query, self._max_yahoo_results)
2435 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2437 elif n > self._max_yahoo_results:
2438 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2439 n = self._max_yahoo_results
2440 self._download_n_results(query, n)
2442 except ValueError: # parsing prefix as integer fails
2443 self._download_n_results(query, 1)
2446 def _download_n_results(self, query, n):
2447 """Downloads a specified number of results for a query"""
2450 already_seen = set()
2454 self.report_download_page(query, pagenum)
2455 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2456 request = urllib2.Request(result_url)
2458 page = urllib2.urlopen(request).read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2463 # Extract video identifiers
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 video_id = mobj.group(1)
2466 if video_id not in already_seen:
2467 video_ids.append(video_id)
2468 already_seen.add(video_id)
2469 if len(video_ids) == n:
2470 # Specified n videos reached
2471 for id in video_ids:
2472 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2475 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2476 for id in video_ids:
2477 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2480 pagenum = pagenum + 1
2483 class YoutubePlaylistIE(InfoExtractor):
2484 """Information Extractor for YouTube playlists."""
2486 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2487 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2488 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2489 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2491 IE_NAME = u'youtube:playlist'
2493 def __init__(self, youtube_ie, downloader=None):
2494 InfoExtractor.__init__(self, downloader)
2495 self._youtube_ie = youtube_ie
2497 def report_download_page(self, playlist_id, pagenum):
2498 """Report attempt to download playlist page with given number."""
2499 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2501 def _real_initialize(self):
2502 self._youtube_ie.initialize()
2504 def _real_extract(self, url):
2505 # Extract playlist id
2506 mobj = re.match(self._VALID_URL, url)
2508 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2512 if mobj.group(3) is not None:
2513 self._youtube_ie.extract(mobj.group(3))
2516 # Download playlist pages
2517 # prefix is 'p' as default for playlists but there are other types that need extra care
2518 playlist_prefix = mobj.group(1)
2519 if playlist_prefix == 'a':
2520 playlist_access = 'artist'
2522 playlist_prefix = 'p'
2523 playlist_access = 'view_play_list'
2524 playlist_id = mobj.group(2)
2529 self.report_download_page(playlist_id, pagenum)
2530 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2531 request = urllib2.Request(url)
2533 page = urllib2.urlopen(request).read()
2534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2538 # Extract video identifiers
2540 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2541 if mobj.group(1) not in ids_in_page:
2542 ids_in_page.append(mobj.group(1))
2543 video_ids.extend(ids_in_page)
2545 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2547 pagenum = pagenum + 1
2549 playliststart = self._downloader.params.get('playliststart', 1) - 1
2550 playlistend = self._downloader.params.get('playlistend', -1)
2551 video_ids = video_ids[playliststart:playlistend]
2553 for id in video_ids:
2554 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2558 class YoutubeUserIE(InfoExtractor):
2559 """Information Extractor for YouTube users."""
2561 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2562 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2563 _GDATA_PAGE_SIZE = 50
2564 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2565 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2567 IE_NAME = u'youtube:user'
2569 def __init__(self, youtube_ie, downloader=None):
2570 InfoExtractor.__init__(self, downloader)
2571 self._youtube_ie = youtube_ie
2573 def report_download_page(self, username, start_index):
2574 """Report attempt to download user page."""
2575 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2576 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2578 def _real_initialize(self):
2579 self._youtube_ie.initialize()
2581 def _real_extract(self, url):
2583 mobj = re.match(self._VALID_URL, url)
2585 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2588 username = mobj.group(1)
2590 # Download video ids using YouTube Data API. Result size per
2591 # query is limited (currently to 50 videos) so we need to query
2592 # page by page until there are no video ids - it means we got
2599 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2600 self.report_download_page(username, start_index)
2602 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2605 page = urllib2.urlopen(request).read()
2606 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2607 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2610 # Extract video identifiers
2613 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2614 if mobj.group(1) not in ids_in_page:
2615 ids_in_page.append(mobj.group(1))
2617 video_ids.extend(ids_in_page)
2619 # A little optimization - if current page is not
2620 # "full
", ie. does not contain PAGE_SIZE video ids then
2621 # we can assume that this page is the last one - there
2622 # are no more ids on further pages - no need to query
2625 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2630 all_ids_count = len(video_ids)
2631 playliststart = self._downloader.params.get('playliststart', 1) - 1
2632 playlistend = self._downloader.params.get('playlistend', -1)
2634 if playlistend == -1:
2635 video_ids = video_ids[playliststart:]
2637 video_ids = video_ids[playliststart:playlistend]
2639 self._downloader.to_screen("[youtube
] user
%s: Collected
%d video
ids (downloading
%d of them
)" %
2640 (username, all_ids_count, len(video_ids)))
2642 for video_id in video_ids:
2643 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2646 class DepositFilesIE(InfoExtractor):
2647 """Information extractor for depositfiles.com"""
2649 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2650 IE_NAME = u'DepositFiles'
2652 def __init__(self, downloader=None):
2653 InfoExtractor.__init__(self, downloader)
2655 def report_download_webpage(self, file_id):
2656 """Report webpage download."""
2657 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2659 def report_extraction(self, file_id):
2660 """Report information extraction."""
2661 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2663 def _real_extract(self, url):
2664 # At this point we have a new file
2665 self._downloader.increment_downloads()
2667 file_id = url.split('/')[-1]
2668 # Rebuild url in english locale
2669 url = 'http://depositfiles.com/en/files/' + file_id
2671 # Retrieve file webpage with 'Free download' button pressed
2672 free_download_indication = { 'gateway_result' : '1' }
2673 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2675 self.report_download_webpage(file_id)
2676 webpage = urllib2.urlopen(request).read()
2677 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2678 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2681 # Search for the real file URL
2682 mobj = re.search(r'<form action="(http
://fileshare
.+?
)"', webpage)
2683 if (mobj is None) or (mobj.group(1) is None):
2684 # Try to figure out reason of the error.
2685 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2686 if (mobj is not None) and (mobj.group(1) is not None):
2687 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2688 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2690 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2693 file_url = mobj.group(1)
2694 file_extension = os.path.splitext(file_url)[1][1:]
2696 # Search for file title
2697 mobj = re.search(r'<b title="(.*?
)">', webpage)
2699 self._downloader.trouble(u'ERROR: unable to extract title')
2701 file_title = mobj.group(1).decode('utf-8')
2704 # Process file information
2705 self._downloader.process_info({
2706 'id': file_id.decode('utf-8'),
2707 'url': file_url.decode('utf-8'),
2709 'upload_date': u'NA',
2710 'title': file_title,
2711 'stitle': file_title,
2712 'ext': file_extension.decode('utf-8'),
2716 except UnavailableVideoError, err:
2717 self._downloader.trouble(u'ERROR: unable to download file')
2720 class FacebookIE(InfoExtractor):
2721 """Information Extractor for Facebook"""
2723 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2724 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2725 _NETRC_MACHINE = 'facebook'
2726 _available_formats = ['video', 'highqual', 'lowqual']
2727 _video_extensions = {
2732 IE_NAME = u'facebook'
2734 def __init__(self, downloader=None):
2735 InfoExtractor.__init__(self, downloader)
2737 def _reporter(self, message):
2738 """Add header and report message."""
2739 self._downloader.to_screen(u'[facebook] %s' % message)
2741 def report_login(self):
2742 """Report attempt to log in."""
2743 self._reporter(u'Logging in')
2745 def report_video_webpage_download(self, video_id):
2746 """Report attempt to download video webpage."""
2747 self._reporter(u'%s: Downloading video webpage' % video_id)
2749 def report_information_extraction(self, video_id):
2750 """Report attempt to extract video information."""
2751 self._reporter(u'%s: Extracting video information' % video_id)
2753 def _parse_page(self, video_webpage):
2754 """Extract video information from page"""
2756 data = {'title': r'\("video_title
", "(.*?
)"\)',
2757 'description': r'<div class="datawrap
">(.*?)</div>',
2758 'owner': r'\("video_owner_name
", "(.*?
)"\)',
2759 'thumbnail': r'\("thumb_url
", "(?P
<THUMB
>.*?
)"\)',
2762 for piece in data.keys():
2763 mobj = re.search(data[piece], video_webpage)
2764 if mobj is not None:
2765 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape
"))
2769 for fmt in self._available_formats:
2770 mobj = re.search(r'\("%s_src
\", "(.+?)"\
)' % fmt, video_webpage)
2771 if mobj is not None:
2772 # URL is in a Javascript segment inside an escaped Unicode format within
2773 # the generally utf-8 page
2774 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2775 video_info['video_urls
'] = video_urls
2779 def _real_initialize(self):
2780 if self._downloader is None:
2785 downloader_params = self._downloader.params
2787 # Attempt to use provided username and password or .netrc data
2788 if downloader_params.get('username
', None) is not None:
2789 useremail = downloader_params['username
']
2790 password = downloader_params['password
']
2791 elif downloader_params.get('usenetrc
', False):
2793 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2794 if info is not None:
2798 raise netrc.NetrcParseError('No authenticators
for %s' % self._NETRC_MACHINE)
2799 except (IOError, netrc.NetrcParseError), err:
2800 self._downloader.to_stderr(u'WARNING
: parsing
.netrc
: %s' % str(err))
2803 if useremail is None:
2812 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2815 login_results = urllib2.urlopen(request).read()
2816 if re.search(r'<form(.*)name
="login"(.*)</form
>', login_results) is not None:
2817 self._downloader.to_stderr(u'WARNING
: unable to log
in: bad username
/password
, or exceded login rate
limit (~
3/min). Check credentials
or wait
.')
2819 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2820 self._downloader.to_stderr(u'WARNING
: unable to log
in: %s' % str(err))
2823 def _real_extract(self, url):
2824 mobj = re.match(self._VALID_URL, url)
2826 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2828 video_id = mobj.group('ID
')
2831 self.report_video_webpage_download(video_id)
2832 request = urllib2.Request('https
://www
.facebook
.com
/video
/video
.php?v
=%s' % video_id)
2834 page = urllib2.urlopen(request)
2835 video_webpage = page.read()
2836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2837 self._downloader.trouble(u'ERROR
: unable to download video webpage
: %s' % str(err))
2840 # Start extracting information
2841 self.report_information_extraction(video_id)
2843 # Extract information
2844 video_info = self._parse_page(video_webpage)
2847 if 'owner
' not in video_info:
2848 self._downloader.trouble(u'ERROR
: unable to extract uploader nickname
')
2850 video_uploader = video_info['owner
']
2853 if 'title
' not in video_info:
2854 self._downloader.trouble(u'ERROR
: unable to extract video title
')
2856 video_title = video_info['title
']
2857 video_title = video_title.decode('utf
-8')
2858 video_title = sanitize_title(video_title)
2860 simple_title = _simplify_title(video_title)
2863 if 'thumbnail
' not in video_info:
2864 self._downloader.trouble(u'WARNING
: unable to extract video thumbnail
')
2865 video_thumbnail = ''
2867 video_thumbnail = video_info['thumbnail
']
2871 if 'upload_date
' in video_info:
2872 upload_time = video_info['upload_date
']
2873 timetuple = email.utils.parsedate_tz(upload_time)
2874 if timetuple is not None:
2876 upload_date = time.strftime('%Y
%m
%d', timetuple[0:9])
2881 video_description = video_info.get('description
', 'No description available
.')
2883 url_map = video_info['video_urls
']
2884 if len(url_map.keys()) > 0:
2885 # Decide which formats to download
2886 req_format = self._downloader.params.get('format
', None)
2887 format_limit = self._downloader.params.get('format_limit
', None)
2889 if format_limit is not None and format_limit in self._available_formats:
2890 format_list = self._available_formats[self._available_formats.index(format_limit):]
2892 format_list = self._available_formats
2893 existing_formats = [x for x in format_list if x in url_map]
2894 if len(existing_formats) == 0:
2895 self._downloader.trouble(u'ERROR
: no known formats available
for video
')
2897 if req_format is None:
2898 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2899 elif req_format == 'worst
':
2900 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2901 elif req_format == '-1':
2902 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2905 if req_format not in url_map:
2906 self._downloader.trouble(u'ERROR
: requested format
not available
')
2908 video_url_list = [(req_format, url_map[req_format])] # Specific format
2910 for format_param, video_real_url in video_url_list:
2912 # At this point we have a new video
2913 self._downloader.increment_downloads()
2916 video_extension = self._video_extensions.get(format_param, 'mp4
')
2919 # Process video information
2920 self._downloader.process_info({
2921 'id': video_id.decode('utf
-8'),
2922 'url
': video_real_url.decode('utf
-8'),
2923 'uploader
': video_uploader.decode('utf
-8'),
2924 'upload_date
': upload_date,
2925 'title
': video_title,
2926 'stitle
': simple_title,
2927 'ext
': video_extension.decode('utf
-8'),
2928 'format
': (format_param is None and u'NA
' or format_param.decode('utf
-8')),
2929 'thumbnail
': video_thumbnail.decode('utf
-8'),
2930 'description
': video_description.decode('utf
-8'),
2933 except UnavailableVideoError, err:
2934 self._downloader.trouble(u'\nERROR
: unable to download video
')
2936 class BlipTVIE(InfoExtractor):
2937 """Information extractor for blip.tv"""
2939 _VALID_URL = r'^
(?
:https?
://)?
(?
:\w
+\
.)?blip\
.tv(/.+)$
'
2940 _URL_EXT = r'^
.*\
.([a
-z0
-9]+)$
'
2941 IE_NAME = u'blip
.tv
'
2943 def report_extraction(self, file_id):
2944 """Report information extraction."""
2945 self._downloader.to_screen(u'[%s] %s: Extracting information
' % (self.IE_NAME, file_id))
2947 def report_direct_download(self, title):
2948 """Report information extraction."""
2949 self._downloader.to_screen(u'[%s] %s: Direct download detected
' % (self.IE_NAME, title))
2951 def _real_extract(self, url):
2952 mobj = re.match(self._VALID_URL, url)
2954 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2961 json_url = url + cchar + 'skin
=json
&version
=2&no_wrap
=1'
2962 request = urllib2.Request(json_url)
2963 self.report_extraction(mobj.group(1))
2966 urlh = urllib2.urlopen(request)
2967 if urlh.headers.get('Content
-Type
', '').startswith('video
/'): # Direct download
2968 basename = url.split('/')[-1]
2969 title,ext = os.path.splitext(basename)
2970 title = title.decode('UTF
-8')
2971 ext = ext.replace('.', '')
2972 self.report_direct_download(title)
2977 'stitle
': _simplify_title(title),
2981 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2982 self._downloader.trouble(u'ERROR
: unable to download video info webpage
: %s' % str(err))
2984 if info is None: # Regular URL
2986 json_code = urlh.read()
2987 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2988 self._downloader.trouble(u'ERROR
: unable to read video info webpage
: %s' % str(err))
2992 json_data = json.loads(json_code)
2993 if 'Post
' in json_data:
2994 data = json_data['Post
']
2998 upload_date = datetime.datetime.strptime(data['datestamp
'], '%m
-%d-%y
%H
:%M
%p
').strftime('%Y
%m
%d')
2999 video_url = data['media
']['url
']
3000 umobj = re.match(self._URL_EXT, video_url)
3002 raise ValueError('Can
not determine filename extension
')
3003 ext = umobj.group(1)
3006 'id': data['item_id
'],
3008 'uploader
': data['display_name
'],
3009 'upload_date
': upload_date,
3010 'title
': data['title
'],
3011 'stitle
': _simplify_title(data['title
']),
3013 'format
': data['media
']['mimeType
'],
3014 'thumbnail
': data['thumbnailUrl
'],
3015 'description
': data['description
'],
3016 'player_url
': data['embedUrl
']
3018 except (ValueError,KeyError), err:
3019 self._downloader.trouble(u'ERROR
: unable to parse video information
: %s' % repr(err))
3022 self._downloader.increment_downloads()
3025 self._downloader.process_info(info)
3026 except UnavailableVideoError, err:
3027 self._downloader.trouble(u'\nERROR
: unable to download video
')
3030 class MyVideoIE(InfoExtractor):
3031 """Information Extractor for myvideo.de."""
3033 _VALID_URL = r'(?
:http
://)?
(?
:www\
.)?myvideo\
.de
/watch
/([0-9]+)/([^?
/]+).*'
3034 IE_NAME = u'myvideo
'
3036 def __init__(self, downloader=None):
3037 InfoExtractor.__init__(self, downloader)
3039 def report_download_webpage(self, video_id):
3040 """Report webpage download."""
3041 self._downloader.to_screen(u'[myvideo
] %s: Downloading webpage
' % video_id)
3043 def report_extraction(self, video_id):
3044 """Report information extraction."""
3045 self._downloader.to_screen(u'[myvideo
] %s: Extracting information
' % video_id)
3047 def _real_extract(self,url):
3048 mobj = re.match(self._VALID_URL, url)
3050 self._download.trouble(u'ERROR
: invalid URL
: %s' % url)
3053 video_id = mobj.group(1)
3056 request = urllib2.Request('http
://www
.myvideo
.de
/watch
/%s' % video_id)
3058 self.report_download_webpage(video_id)
3059 webpage = urllib2.urlopen(request).read()
3060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3061 self._downloader.trouble(u'ERROR
: Unable to retrieve video webpage
: %s' % str(err))
3064 self.report_extraction(video_id)
3065 mobj = re.search(r'<link rel
=\'image_src
\' href
=\'(http
://is[0-9].myvideo\
.de
/de
/movie
[0-9]+/[a
-f0
-9]+)/thumbs
/[^
.]+\
.jpg
\' />',
3068 self._downloader.trouble(u'ERROR
: unable to extract media URL
')
3070 video_url = mobj.group(1) + ('/%s.flv
' % video_id)
3072 mobj = re.search('<title
>([^
<]+)</title
>', webpage)
3074 self._downloader.trouble(u'ERROR
: unable to extract title
')
3077 video_title = mobj.group(1)
3078 video_title = sanitize_title(video_title)
3080 simple_title = _simplify_title(video_title)
3083 self._downloader.process_info({
3087 'upload_date
': u'NA
',
3088 'title
': video_title,
3089 'stitle
': simple_title,
3094 except UnavailableVideoError:
3095 self._downloader.trouble(u'\nERROR
: Unable to download video
')
3097 class ComedyCentralIE(InfoExtractor):
3098 """Information extractor for The Daily Show and Colbert Report """
3100 _VALID_URL = r'^
(:(?P
<shortname
>tds|thedailyshow|cr|colbert|colbertnation|colbertreport
))|
(https?
://)?
(www\
.)?
(?P
<showname
>thedailyshow|colbertnation
)\
.com
/full
-episodes
/(?P
<episode
>.*)$
'
3101 IE_NAME = u'comedycentral
'
3103 def report_extraction(self, episode_id):
3104 self._downloader.to_screen(u'[comedycentral
] %s: Extracting information
' % episode_id)
3106 def report_config_download(self, episode_id):
3107 self._downloader.to_screen(u'[comedycentral
] %s: Downloading configuration
' % episode_id)
3109 def report_index_download(self, episode_id):
3110 self._downloader.to_screen(u'[comedycentral
] %s: Downloading show index
' % episode_id)
3112 def report_player_url(self, episode_id):
3113 self._downloader.to_screen(u'[comedycentral
] %s: Determining player URL
' % episode_id)
3115 def _real_extract(self, url):
3116 mobj = re.match(self._VALID_URL, url)
3118 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
3121 if mobj.group('shortname
'):
3122 if mobj.group('shortname
') in ('tds
', 'thedailyshow
'):
3123 url = u'http
://www
.thedailyshow
.com
/full
-episodes
/'
3125 url = u'http
://www
.colbertnation
.com
/full
-episodes
/'
3126 mobj = re.match(self._VALID_URL, url)
3127 assert mobj is not None
3129 dlNewest = not mobj.group('episode
')
3131 epTitle = mobj.group('showname
')
3133 epTitle = mobj.group('episode
')
3135 req = urllib2.Request(url)
3136 self.report_extraction(epTitle)
3138 htmlHandle = urllib2.urlopen(req)
3139 html = htmlHandle.read()
3140 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3141 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % unicode(err))
3144 url = htmlHandle.geturl()
3145 mobj = re.match(self._VALID_URL, url)
3147 self._downloader.trouble(u'ERROR
: Invalid redirected URL
: ' + url)
3149 if mobj.group('episode
') == '':
3150 self._downloader.trouble(u'ERROR
: Redirected URL
is still
not specific
: ' + url)
3152 epTitle = mobj.group('episode
')
3154 mMovieParams = re.findall('<param name
="movie" value
="(http://media.mtvnservices.com/([^"]*episode
.*?
:.*?
))"/>', html)
3155 if len(mMovieParams) == 0:
3156 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3159 playerUrl_raw = mMovieParams[0][0]
3160 self.report_player_url(epTitle)
3162 urlHandle = urllib2.urlopen(playerUrl_raw)
3163 playerUrl = urlHandle.geturl()
3164 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3165 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3168 uri = mMovieParams[0][1]
3169 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3170 self.report_index_download(epTitle)
3172 indexXml = urllib2.urlopen(indexUrl).read()
3173 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3174 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3177 idoc = xml.etree.ElementTree.fromstring(indexXml)
3178 itemEls = idoc.findall('.//item')
3179 for itemEl in itemEls:
3180 mediaId = itemEl.findall('./guid')[0].text
3181 shortMediaId = mediaId.split(':')[-1]
3182 showId = mediaId.split(':')[-2].replace('.com', '')
3183 officialTitle = itemEl.findall('./title')[0].text
3184 officialDate = itemEl.findall('./pubDate')[0].text
3186 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3187 urllib.urlencode({'uri': mediaId}))
3188 configReq = urllib2.Request(configUrl)
3189 self.report_config_download(epTitle)
3191 configXml = urllib2.urlopen(configReq).read()
3192 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3193 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3196 cdoc = xml.etree.ElementTree.fromstring(configXml)
3198 for rendition in cdoc.findall('.//rendition'):
3199 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3203 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3206 # For now, just pick the highest bitrate
3207 format,video_url = turls[-1]
3209 self._downloader.increment_downloads()
3211 effTitle = showId + u'-' + epTitle
3216 'upload_date': officialDate,
3218 'stitle': _simplify_title(effTitle),
3222 'description': officialTitle,
3223 'player_url': playerUrl
3227 self._downloader.process_info(info)
3228 except UnavailableVideoError, err:
3229 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3233 class EscapistIE(InfoExtractor):
3234 """Information extractor for The Escapist """
3236 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3237 IE_NAME = u'escapist'
3239 def report_extraction(self, showName):
3240 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3242 def report_config_download(self, showName):
3243 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3245 def _real_extract(self, url):
3246 htmlParser = HTMLParser.HTMLParser()
3248 mobj = re.match(self._VALID_URL, url)
3250 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3252 showName = mobj.group('showname')
3253 videoId = mobj.group('episode')
3255 self.report_extraction(showName)
3257 webPage = urllib2.urlopen(url).read()
3258 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3259 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3262 descMatch = re.search('<meta name="description
" content="([^
"]*)"', webPage)
3263 description = htmlParser.unescape(descMatch.group(1))
3264 imgMatch = re.search('<meta
property="og:image" content
="([^"]*)"', webPage)
3265 imgUrl = htmlParser.unescape(imgMatch.group(1))
3266 playerUrlMatch = re.search('<meta property="og
:video
" content="([^
"]*)"', webPage)
3267 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3268 configUrlMatch = re.search('config
=(.*)$
', playerUrl)
3269 configUrl = urllib2.unquote(configUrlMatch.group(1))
3271 self.report_config_download(showName)
3273 configJSON = urllib2.urlopen(configUrl).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR
: unable to download configuration
: ' + unicode(err))
3278 # Technically, it's JavaScript
, not JSON
3279 configJSON
= configJSON
.replace("'", '"')
3282 config
= json
.loads(configJSON
)
3283 except (ValueError,), err
:
3284 self
._downloader
.trouble(u
'ERROR: Invalid JSON in configuration file: ' + unicode(err
))
3287 playlist
= config
['playlist']
3288 videoUrl
= playlist
[1]['url']
3290 self
._downloader
.increment_downloads()
3294 'uploader': showName
,
3295 'upload_date': None,
3297 'stitle': _simplify_title(showName
),
3300 'thumbnail': imgUrl
,
3301 'description': description
,
3302 'player_url': playerUrl
,
3306 self
._downloader
.process_info(info
)
3307 except UnavailableVideoError
, err
:
3308 self
._downloader
.trouble(u
'\nERROR: unable to download ' + videoId
)
3311 class CollegeHumorIE(InfoExtractor
):
3312 """Information extractor for collegehumor.com"""
3314 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3315 IE_NAME
= u
'collegehumor'
3317 def report_webpage(self
, video_id
):
3318 """Report information extraction."""
3319 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
3321 def report_extraction(self
, video_id
):
3322 """Report information extraction."""
3323 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
3325 def _real_extract(self
, url
):
3326 htmlParser
= HTMLParser
.HTMLParser()
3328 mobj
= re
.match(self
._VALID
_URL
, url
)
3330 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
3332 video_id
= mobj
.group('videoid')
3334 self
.report_webpage(video_id
)
3335 request
= urllib2
.Request(url
)
3337 webpage
= urllib2
.urlopen(request
).read()
3338 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3339 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
3342 m
= re
.search(r
'id="video:(?P<internalvideoid>[0-9]+)"', webpage
)
3344 self
._downloader
.trouble(u
'ERROR: Cannot extract internal video ID')
3346 internal_video_id
= m
.group('internalvideoid')
3350 'internal_id': internal_video_id
,
3353 self
.report_extraction(video_id
)
3354 xmlUrl
= 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3356 metaXml
= urllib2
.urlopen(xmlUrl
).read()
3357 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3358 self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % str(err
))
3361 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
3363 videoNode
= mdoc
.findall('./video')[0]
3364 info
['description'] = videoNode
.findall('./description')[0].text
3365 info
['title'] = videoNode
.findall('./caption')[0].text
3366 info
['stitle'] = _simplify_title(info
['title'])
3367 info
['url'] = videoNode
.findall('./file')[0].text
3368 info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
3369 info
['ext'] = info
['url'].rpartition('.')[2]
3370 info
['format'] = info
['ext']
3372 self
._downloader
.trouble(u
'\nERROR: Invalid metadata XML file')
3375 self
._downloader
.increment_downloads()
3378 self
._downloader
.process_info(info
)
3379 except UnavailableVideoError
, err
:
3380 self
._downloader
.trouble(u
'\nERROR: unable to download video')
3383 class XVideosIE(InfoExtractor
):
3384 """Information extractor for xvideos.com"""
3386 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3387 IE_NAME
= u
'xvideos'
3389 def report_webpage(self
, video_id
):
3390 """Report information extraction."""
3391 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
3393 def report_extraction(self
, video_id
):
3394 """Report information extraction."""
3395 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
3397 def _real_extract(self
, url
):
3398 htmlParser
= HTMLParser
.HTMLParser()
3400 mobj
= re
.match(self
._VALID
_URL
, url
)
3402 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
3404 video_id
= mobj
.group(1).decode('utf-8')
3406 self
.report_webpage(video_id
)
3408 request
= urllib2
.Request(r
'http://www.xvideos.com/video' + video_id
)
3410 webpage
= urllib2
.urlopen(request
).read()
3411 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3412 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
3415 self
.report_extraction(video_id
)
3419 mobj
= re
.search(r
'flv_url=(.+?)&', webpage
)
3421 self
._downloader
.trouble(u
'ERROR: unable to extract video url')
3423 video_url
= urllib2
.unquote(mobj
.group(1).decode('utf-8'))
3427 mobj
= re
.search(r
'<title>(.*?)\s+-\s+XVID', webpage
)
3429 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
3431 video_title
= mobj
.group(1).decode('utf-8')
3434 # Extract video thumbnail
3435 mobj
= re
.search(r
'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage
)
3437 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
3439 video_thumbnail
= mobj
.group(1).decode('utf-8')
3443 self
._downloader
.increment_downloads()
3448 'upload_date': None,
3449 'title': video_title
,
3450 'stitle': _simplify_title(video_title
),
3453 'thumbnail': video_thumbnail
,
3454 'description': None,
3459 self
._downloader
.process_info(info
)
3460 except UnavailableVideoError
, err
:
3461 self
._downloader
.trouble(u
'\nERROR: unable to download ' + video_id
)
3464 class SoundcloudIE(InfoExtractor
):
3465 """Information extractor for soundcloud.com
3466 To access the media, the uid of the song and a stream token
3467 must be extracted from the page source and the script must make
3468 a request to media.soundcloud.com/crossdomain.xml. Then
3469 the media can be grabbed by requesting from an url composed
3470 of the stream token and uid
3473 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3474 IE_NAME
= u
'soundcloud'
3476 def __init__(self
, downloader
=None):
3477 InfoExtractor
.__init
__(self
, downloader
)
3479 def report_webpage(self
, video_id
):
3480 """Report information extraction."""
3481 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
3483 def report_extraction(self
, video_id
):
3484 """Report information extraction."""
3485 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
3487 def _real_extract(self
, url
):
3488 htmlParser
= HTMLParser
.HTMLParser()
3490 mobj
= re
.match(self
._VALID
_URL
, url
)
3492 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
3495 # extract uploader (which is in the url)
3496 uploader
= mobj
.group(1).decode('utf-8')
3497 # extract simple title (uploader + slug of song title)
3498 slug_title
= mobj
.group(2).decode('utf-8')
3499 simple_title
= uploader
+ '-' + slug_title
3501 self
.report_webpage('%s/%s' % (uploader
, slug_title
))
3503 request
= urllib2
.Request('http://soundcloud.com/%s/%s' % (uploader
, slug_title
))
3505 webpage
= urllib2
.urlopen(request
).read()
3506 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3507 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
3510 self
.report_extraction('%s/%s' % (uploader
, slug_title
))
3512 # extract uid and stream token that soundcloud hands out for access
3513 mobj
= re
.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage
)
3515 video_id
= mobj
.group(1)
3516 stream_token
= mobj
.group(2)
3518 # extract unsimplified title
3519 mobj
= re
.search('"title":"(.*?)",', webpage
)
3521 title
= mobj
.group(1)
3523 # construct media url (with uid/token)
3524 mediaURL
= "http://media.soundcloud.com/stream/%s?stream_token=%s"
3525 mediaURL
= mediaURL
% (video_id
, stream_token
)
3528 description
= u
'No description available'
3529 mobj
= re
.search('track-description-value"><p>(.*?)</p>', webpage
)
3531 description
= mobj
.group(1)
3535 mobj
= re
.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage
)
3538 upload_date
= datetime
.datetime
.strptime(mobj
.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3539 except Exception, e
:
3542 # for soundcloud, a request to a cross domain is required for cookies
3543 request
= urllib2
.Request('http://media.soundcloud.com/crossdomain.xml', std_headers
)
3546 self
._downloader
.process_info({
3547 'id': video_id
.decode('utf-8'),
3549 'uploader': uploader
.decode('utf-8'),
3550 'upload_date': upload_date
,
3551 'title': simple_title
.decode('utf-8'),
3552 'stitle': simple_title
.decode('utf-8'),
3556 'description': description
.decode('utf-8')
3558 except UnavailableVideoError
:
3559 self
._downloader
.trouble(u
'\nERROR: unable to download video')
3562 class InfoQIE(InfoExtractor
):
3563 """Information extractor for infoq.com"""
3565 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3568 def report_webpage(self
, video_id
):
3569 """Report information extraction."""
3570 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
3572 def report_extraction(self
, video_id
):
3573 """Report information extraction."""
3574 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
3576 def _real_extract(self
, url
):
3577 htmlParser
= HTMLParser
.HTMLParser()
3579 mobj
= re
.match(self
._VALID
_URL
, url
)
3581 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
3584 self
.report_webpage(url
)
3586 request
= urllib2
.Request(url
)
3588 webpage
= urllib2
.urlopen(request
).read()
3589 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3590 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
3593 self
.report_extraction(url
)
3597 mobj
= re
.search(r
"jsclassref='([^']*)'", webpage
)
3599 self
._downloader
.trouble(u
'ERROR: unable to extract video url')
3601 video_url
= 'rtmpe://video.infoq.com/cfx/st/' + urllib2
.unquote(mobj
.group(1).decode('base64'))
3605 mobj
= re
.search(r
'contentTitle = "(.*?)";', webpage
)
3607 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
3609 video_title
= mobj
.group(1).decode('utf-8')
3611 # Extract description
3612 video_description
= u
'No description available.'
3613 mobj
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', webpage
)
3614 if mobj
is not None:
3615 video_description
= mobj
.group(1).decode('utf-8')
3617 video_filename
= video_url
.split('/')[-1]
3618 video_id
, extension
= video_filename
.split('.')
3620 self
._downloader
.increment_downloads()
3625 'upload_date': None,
3626 'title': video_title
,
3627 'stitle': _simplify_title(video_title
),
3629 'format': extension
, # Extension is always(?) mp4, but seems to be flv
3631 'description': video_description
,
3636 self
._downloader
.process_info(info
)
3637 except UnavailableVideoError
, err
:
3638 self
._downloader
.trouble(u
'\nERROR: unable to download ' + video_url
)
3640 class MixcloudIE(InfoExtractor
):
3641 """Information extractor for www.mixcloud.com"""
3642 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3643 IE_NAME
= u
'mixcloud'
3645 def __init__(self
, downloader
=None):
3646 InfoExtractor
.__init
__(self
, downloader
)
3648 def report_download_json(self
, file_id
):
3649 """Report JSON download."""
3650 self
._downloader
.to_screen(u
'[%s] Downloading json' % self
.IE_NAME
)
3652 def report_extraction(self
, file_id
):
3653 """Report information extraction."""
3654 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, file_id
))
3656 def get_urls(self
, jsonData
, fmt
, bitrate
='best'):
3657 """Get urls from 'audio_formats' section in json"""
3660 bitrate_list
= jsonData
[fmt
]
3661 if bitrate
is None or bitrate
== 'best' or bitrate
not in bitrate_list
:
3662 bitrate
= max(bitrate_list
) # select highest
3664 url_list
= jsonData
[fmt
][bitrate
]
3665 except TypeError: # we have no bitrate info.
3666 url_list
= jsonData
[fmt
]
3670 def check_urls(self
, url_list
):
3671 """Returns 1st active url from list"""
3672 for url
in url_list
:
3674 urllib2
.urlopen(url
)
3676 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3681 def _print_formats(self
, formats
):
3682 print 'Available formats:'
3683 for fmt
in formats
.keys():
3684 for b
in formats
[fmt
]:
3686 ext
= formats
[fmt
][b
][0]
3687 print '%s\t%s\t[%s]' % (fmt
, b
, ext
.split('.')[-1])
3688 except TypeError: # we have no bitrate info
3689 ext
= formats
[fmt
][0]
3690 print '%s\t%s\t[%s]' % (fmt
, '??', ext
.split('.')[-1])
3693 def _real_extract(self
, url
):
3694 mobj
= re
.match(self
._VALID
_URL
, url
)
3696 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
3698 # extract uploader & filename from url
3699 uploader
= mobj
.group(1).decode('utf-8')
3700 file_id
= uploader
+ "-" + mobj
.group(2).decode('utf-8')
3702 # construct API request
3703 file_url
= 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url
.split('/')[-3:-1]) + '.json'
3704 # retrieve .json file with links to files
3705 request
= urllib2
.Request(file_url
)
3707 self
.report_download_json(file_url
)
3708 jsonData
= urllib2
.urlopen(request
).read()
3709 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3710 self
._downloader
.trouble(u
'ERROR: Unable to retrieve file: %s' % str(err
))
3714 json_data
= json
.loads(jsonData
)
3715 player_url
= json_data
['player_swf_url']
3716 formats
= dict(json_data
['audio_formats'])
3718 req_format
= self
._downloader
.params
.get('format', None)
3721 if self
._downloader
.params
.get('listformats', None):
3722 self
._print
_formats
(formats
)
3725 if req_format
is None or req_format
== 'best':
3726 for format_param
in formats
.keys():
3727 url_list
= self
.get_urls(formats
, format_param
)
3729 file_url
= self
.check_urls(url_list
)
3730 if file_url
is not None:
3733 if req_format
not in formats
.keys():
3734 self
._downloader
.trouble(u
'ERROR: format is not available')
3737 url_list
= self
.get_urls(formats
, req_format
)
3738 file_url
= self
.check_urls(url_list
)
3739 format_param
= req_format
3742 self
._downloader
.increment_downloads()
3744 # Process file information
3745 self
._downloader
.process_info({
3746 'id': file_id
.decode('utf-8'),
3747 'url': file_url
.decode('utf-8'),
3748 'uploader': uploader
.decode('utf-8'),
3749 'upload_date': u
'NA',
3750 'title': json_data
['name'],
3751 'stitle': _simplify_title(json_data
['name']),
3752 'ext': file_url
.split('.')[-1].decode('utf-8'),
3753 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
3754 'thumbnail': json_data
['thumbnail_url'],
3755 'description': json_data
['description'],
3756 'player_url': player_url
.decode('utf-8'),
3758 except UnavailableVideoError
, err
:
3759 self
._downloader
.trouble(u
'ERROR: unable to download file')
3761 class StanfordOpenClassroomIE(InfoExtractor
):
3762 """Information extractor for Stanford's Open ClassRoom"""
3764 _VALID_URL
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3765 IE_NAME
= u
'stanfordoc'
3767 def report_download_webpage(self
, objid
):
3768 """Report information extraction."""
3769 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, objid
))
3771 def report_extraction(self
, video_id
):
3772 """Report information extraction."""
3773 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
3775 def _real_extract(self
, url
):
3776 mobj
= re
.match(self
._VALID
_URL
, url
)
3778 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
3781 if mobj
.group('course') and mobj
.group('video'): # A specific video
3782 course
= mobj
.group('course')
3783 video
= mobj
.group('video')
3785 'id': _simplify_title(course
+ '_' + video
),
3788 self
.report_extraction(info
['id'])
3789 baseUrl
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course
+ '/videos/'
3790 xmlUrl
= baseUrl
+ video
+ '.xml'
3792 metaXml
= urllib2
.urlopen(xmlUrl
).read()
3793 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3794 self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % unicode(err
))
3796 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
3798 info
['title'] = mdoc
.findall('./title')[0].text
3799 info
['url'] = baseUrl
+ mdoc
.findall('./videoFile')[0].text
3801 self
._downloader
.trouble(u
'\nERROR: Invalid metadata XML file')
3803 info
['stitle'] = _simplify_title(info
['title'])
3804 info
['ext'] = info
['url'].rpartition('.')[2]
3805 info
['format'] = info
['ext']
3806 self
._downloader
.increment_downloads()
3808 self
._downloader
.process_info(info
)
3809 except UnavailableVideoError
, err
:
3810 self
._downloader
.trouble(u
'\nERROR: unable to download video')
3811 elif mobj
.group('course'): # A course page
3812 unescapeHTML
= HTMLParser
.HTMLParser().unescape
3814 course
= mobj
.group('course')
3816 'id': _simplify_title(course
),
3820 self
.report_download_webpage(info
['id'])
3822 coursepage
= urllib2
.urlopen(url
).read()
3823 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3824 self
._downloader
.trouble(u
'ERROR: unable to download course info page: ' + unicode(err
))
3827 m
= re
.search('<h1>([^<]+)</h1>', coursepage
)
3829 info
['title'] = unescapeHTML(m
.group(1))
3831 info
['title'] = info
['id']
3832 info
['stitle'] = _simplify_title(info
['title'])
3834 m
= re
.search('<description>([^<]+)</description>', coursepage
)
3836 info
['description'] = unescapeHTML(m
.group(1))
3838 links
= _orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
))
3841 'type': 'reference',
3842 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
),
3846 for entry
in info
['list']:
3847 assert entry
['type'] == 'reference'
3848 self
.extract(entry
['url'])
3850 unescapeHTML
= HTMLParser
.HTMLParser().unescape
3853 'id': 'Stanford OpenClassroom',
3857 self
.report_download_webpage(info
['id'])
3858 rootURL
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3860 rootpage
= urllib2
.urlopen(rootURL
).read()
3861 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
3862 self
._downloader
.trouble(u
'ERROR: unable to download course info page: ' + unicode(err
))
3865 info
['title'] = info
['id']
3866 info
['stitle'] = _simplify_title(info
['title'])
3868 links
= _orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
))
3871 'type': 'reference',
3872 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
),
3876 for entry
in info
['list']:
3877 assert entry
['type'] == 'reference'
3878 self
.extract(entry
['url'])
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	PostProcessor.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone audio file using ffmpeg/ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		# Ask ffprobe for the stream list and pick the codec of the audio stream.
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4036 def updateSelf(downloader
, filename
):
4037 ''' Update the program file with the latest version from the repository '''
4038 # Note: downloader only used for options
4039 if not os
.access(filename
, os
.W_OK
):
4040 sys
.exit('ERROR: no write permissions on %s' % filename
)
4042 downloader
.to_screen('Updating to latest version...')
4046 urlh
= urllib
.urlopen(UPDATE_URL
)
4047 newcontent
= urlh
.read()
4049 vmatch
= re
.search("__version__ = '([^']+)'", newcontent
)
4050 if vmatch
is not None and vmatch
.group(1) == __version__
:
4051 downloader
.to_screen('youtube-dl is up-to-date (' + __version__
+ ')')
4055 except (IOError, OSError), err
:
4056 sys
.exit('ERROR: unable to download latest version')
4059 outf
= open(filename
, 'wb')
4061 outf
.write(newcontent
)
4064 except (IOError, OSError), err
:
4065 sys
.exit('ERROR: unable to overwrite current version')
4067 downloader
.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4075 def _readOptions(filename
):
4077 optionf
= open(filename
)
4079 return [] # silently skip if file is not present
4083 res
+= shlex
.split(l
, comments
=True)
4088 def _format_option_string(option
):
4089 ''' ('-o', '--option') -> -o, --format METAVAR'''
4093 if option
._short
_opts
: opts
.append(option
._short
_opts
[0])
4094 if option
._long
_opts
: opts
.append(option
._long
_opts
[0])
4095 if len(opts
) > 1: opts
.insert(1, ', ')
4097 if option
.takes_value(): opts
.append(' %s' % option
.metavar
)
4099 return "".join(opts
)
4101 def _find_term_columns():
4102 columns
= os
.environ
.get('COLUMNS', None)
4107 sp
= subprocess
.Popen(['stty', 'size'], stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
)
4108 out
,err
= sp
.communicate()
4109 return int(out
.split()[1])
4115 max_help_position
= 80
4117 # No need to wrap help messages if we're on a wide console
4118 columns
= _find_term_columns()
4119 if columns
: max_width
= columns
4121 fmt
= optparse
.IndentedHelpFormatter(width
=max_width
, max_help_position
=max_help_position
)
4122 fmt
.format_option_strings
= _format_option_string
4125 'version' : __version__
,
4127 'usage' : '%prog [options] url [url...]',
4128 'conflict_handler' : 'resolve',
4131 parser
= optparse
.OptionParser(**kw
)
4134 general
= optparse
.OptionGroup(parser
, 'General Options')
4135 selection
= optparse
.OptionGroup(parser
, 'Video Selection')
4136 authentication
= optparse
.OptionGroup(parser
, 'Authentication Options')
4137 video_format
= optparse
.OptionGroup(parser
, 'Video Format Options')
4138 postproc
= optparse
.OptionGroup(parser
, 'Post-processing Options')
4139 filesystem
= optparse
.OptionGroup(parser
, 'Filesystem Options')
4140 verbosity
= optparse
.OptionGroup(parser
, 'Verbosity / Simulation Options')
4142 general
.add_option('-h', '--help',
4143 action
='help', help='print this help text and exit')
4144 general
.add_option('-v', '--version',
4145 action
='version', help='print program version and exit')
4146 general
.add_option('-U', '--update',
4147 action
='store_true', dest
='update_self', help='update this program to latest version')
4148 general
.add_option('-i', '--ignore-errors',
4149 action
='store_true', dest
='ignoreerrors', help='continue on download errors', default
=False)
4150 general
.add_option('-r', '--rate-limit',
4151 dest
='ratelimit', metavar
='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4152 general
.add_option('-R', '--retries',
4153 dest
='retries', metavar
='RETRIES', help='number of retries (default is 10)', default
=10)
4154 general
.add_option('--dump-user-agent',
4155 action
='store_true', dest
='dump_user_agent',
4156 help='display the current browser identification', default
=False)
4157 general
.add_option('--list-extractors',
4158 action
='store_true', dest
='list_extractors',
4159 help='List all supported extractors and the URLs they would handle', default
=False)
4161 selection
.add_option('--playlist-start',
4162 dest
='playliststart', metavar
='NUMBER', help='playlist video to start at (default is 1)', default
=1)
4163 selection
.add_option('--playlist-end',
4164 dest
='playlistend', metavar
='NUMBER', help='playlist video to end at (default is last)', default
=-1)
4165 selection
.add_option('--match-title', dest
='matchtitle', metavar
='REGEX',help='download only matching titles (regex or caseless sub-string)')
4166 selection
.add_option('--reject-title', dest
='rejecttitle', metavar
='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4167 selection
.add_option('--max-downloads', metavar
='NUMBER', dest
='max_downloads', help='Abort after downloading NUMBER files', default
=None)
4169 authentication
.add_option('-u', '--username',
4170 dest
='username', metavar
='USERNAME', help='account username')
4171 authentication
.add_option('-p', '--password',
4172 dest
='password', metavar
='PASSWORD', help='account password')
4173 authentication
.add_option('-n', '--netrc',
4174 action
='store_true', dest
='usenetrc', help='use .netrc authentication data', default
=False)
4177 video_format
.add_option('-f', '--format',
4178 action
='store', dest
='format', metavar
='FORMAT', help='video format code')
4179 video_format
.add_option('--all-formats',
4180 action
='store_const', dest
='format', help='download all available video formats', const
='all')
4181 video_format
.add_option('--prefer-free-formats',
4182 action
='store_true', dest
='prefer_free_formats', default
=False, help='prefer free video formats unless a specific one is requested')
4183 video_format
.add_option('--max-quality',
4184 action
='store', dest
='format_limit', metavar
='FORMAT', help='highest quality format to download')
4185 video_format
.add_option('-F', '--list-formats',
4186 action
='store_true', dest
='listformats', help='list all available formats (currently youtube only)')
4189 verbosity
.add_option('-q', '--quiet',
4190 action
='store_true', dest
='quiet', help='activates quiet mode', default
=False)
4191 verbosity
.add_option('-s', '--simulate',
4192 action
='store_true', dest
='simulate', help='do not download the video and do not write anything to disk', default
=False)
4193 verbosity
.add_option('--skip-download',
4194 action
='store_true', dest
='skip_download', help='do not download the video', default
=False)
4195 verbosity
.add_option('-g', '--get-url',
4196 action
='store_true', dest
='geturl', help='simulate, quiet but print URL', default
=False)
4197 verbosity
.add_option('-e', '--get-title',
4198 action
='store_true', dest
='gettitle', help='simulate, quiet but print title', default
=False)
4199 verbosity
.add_option('--get-thumbnail',
4200 action
='store_true', dest
='getthumbnail',
4201 help='simulate, quiet but print thumbnail URL', default
=False)
4202 verbosity
.add_option('--get-description',
4203 action
='store_true', dest
='getdescription',
4204 help='simulate, quiet but print video description', default
=False)
4205 verbosity
.add_option('--get-filename',
4206 action
='store_true', dest
='getfilename',
4207 help='simulate, quiet but print output filename', default
=False)
4208 verbosity
.add_option('--get-format',
4209 action
='store_true', dest
='getformat',
4210 help='simulate, quiet but print output format', default
=False)
4211 verbosity
.add_option('--no-progress',
4212 action
='store_true', dest
='noprogress', help='do not print progress bar', default
=False)
4213 verbosity
.add_option('--console-title',
4214 action
='store_true', dest
='consoletitle',
4215 help='display progress in console titlebar', default
=False)
4218 filesystem
.add_option('-t', '--title',
4219 action
='store_true', dest
='usetitle', help='use title in file name', default
=False)
4220 filesystem
.add_option('-l', '--literal',
4221 action
='store_true', dest
='useliteral', help='use literal title in file name', default
=False)
4222 filesystem
.add_option('-A', '--auto-number',
4223 action
='store_true', dest
='autonumber',
4224 help='number downloaded files starting from 00000', default
=False)
4225 filesystem
.add_option('-o', '--output',
4226 dest
='outtmpl', metavar
='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4227 filesystem
.add_option('-a', '--batch-file',
4228 dest
='batchfile', metavar
='FILE', help='file containing URLs to download (\'-\' for stdin)')
4229 filesystem
.add_option('-w', '--no-overwrites',
4230 action
='store_true', dest
='nooverwrites', help='do not overwrite files', default
=False)
4231 filesystem
.add_option('-c', '--continue',
4232 action
='store_true', dest
='continue_dl', help='resume partially downloaded files', default
=False)
4233 filesystem
.add_option('--no-continue',
4234 action
='store_false', dest
='continue_dl',
4235 help='do not resume partially downloaded files (restart from beginning)')
4236 filesystem
.add_option('--cookies',
4237 dest
='cookiefile', metavar
='FILE', help='file to read cookies from and dump cookie jar in')
4238 filesystem
.add_option('--no-part',
4239 action
='store_true', dest
='nopart', help='do not use .part files', default
=False)
4240 filesystem
.add_option('--no-mtime',
4241 action
='store_false', dest
='updatetime',
4242 help='do not use the Last-modified header to set the file modification time', default
=True)
4243 filesystem
.add_option('--write-description',
4244 action
='store_true', dest
='writedescription',
4245 help='write video description to a .description file', default
=False)
4246 filesystem
.add_option('--write-info-json',
4247 action
='store_true', dest
='writeinfojson',
4248 help='write video metadata to a .info.json file', default
=False)
4251 postproc
.add_option('--extract-audio', action
='store_true', dest
='extractaudio', default
=False,
4252 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4253 postproc
.add_option('--audio-format', metavar
='FORMAT', dest
='audioformat', default
='best',
4254 help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
4255 postproc
.add_option('--audio-quality', metavar
='QUALITY', dest
='audioquality', default
='128K',
4256 help='ffmpeg audio bitrate specification, 128k by default')
4257 postproc
.add_option('-k', '--keep-video', action
='store_true', dest
='keepvideo', default
=False,
4258 help='keeps the video file on disk after the post-processing; the video is erased by default')
4261 parser
.add_option_group(general
)
4262 parser
.add_option_group(selection
)
4263 parser
.add_option_group(filesystem
)
4264 parser
.add_option_group(verbosity
)
4265 parser
.add_option_group(video_format
)
4266 parser
.add_option_group(authentication
)
4267 parser
.add_option_group(postproc
)
4269 xdg_config_home
= os
.environ
.get('XDG_CONFIG_HOME')
4271 userConf
= os
.path
.join(xdg_config_home
, 'youtube-dl.conf')
4273 userConf
= os
.path
.join(os
.path
.expanduser('~'), '.config', 'youtube-dl.conf')
4274 argv
= _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf
) + sys
.argv
[1:]
4275 opts
, args
= parser
.parse_args(argv
)
4277 return parser
, opts
, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): most of this list was garbled in the source; the entries
    # below are reconstructed from the 2011.12.15 release -- verify upstream.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    return [
        YoutubePlaylistIE(youtube_ie),
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        youtube_ie,
        MetacafeIE(youtube_ie),
        DailymotionIE(),
        google_ie,
        GoogleSearchIE(google_ie),
        PhotobucketIE(),
        yahoo_ie,
        YahooSearchIE(yahoo_ie),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),

        GenericIE()
    ]
4316 parser
, opts
, args
= parseOpts()
4318 # Open appropriate CookieJar
4319 if opts
.cookiefile
is None:
4320 jar
= cookielib
.CookieJar()
4323 jar
= cookielib
.MozillaCookieJar(opts
.cookiefile
)
4324 if os
.path
.isfile(opts
.cookiefile
) and os
.access(opts
.cookiefile
, os
.R_OK
):
4326 except (IOError, OSError), err
:
4327 sys
.exit(u
'ERROR: unable to open cookie file')
4330 if opts
.dump_user_agent
:
4331 print std_headers
['User-Agent']
4334 # Batch file verification
4336 if opts
.batchfile
is not None:
4338 if opts
.batchfile
== '-':
4341 batchfd
= open(opts
.batchfile
, 'r')
4342 batchurls
= batchfd
.readlines()
4343 batchurls
= [x
.strip() for x
in batchurls
]
4344 batchurls
= [x
for x
in batchurls
if len(x
) > 0 and not re
.search(r
'^[#/;]', x
)]
4346 sys
.exit(u
'ERROR: batch file could not be read')
4347 all_urls
= batchurls
+ args
4349 # General configuration
4350 cookie_processor
= urllib2
.HTTPCookieProcessor(jar
)
4351 opener
= urllib2
.build_opener(urllib2
.ProxyHandler(), cookie_processor
, YoutubeDLHandler())
4352 urllib2
.install_opener(opener
)
4353 socket
.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4355 extractors
= gen_extractors()
4357 if opts
.list_extractors
:
4358 for ie
in extractors
:
4360 matchedUrls
= filter(lambda url
: ie
.suitable(url
), all_urls
)
4361 all_urls
= filter(lambda url
: url
not in matchedUrls
, all_urls
)
4362 for mu
in matchedUrls
:
4366 # Conflicting, missing and erroneous options
4367 if opts
.usenetrc
and (opts
.username
is not None or opts
.password
is not None):
4368 parser
.error(u
'using .netrc conflicts with giving username/password')
4369 if opts
.password
is not None and opts
.username
is None:
4370 parser
.error(u
'account username missing')
4371 if opts
.outtmpl
is not None and (opts
.useliteral
or opts
.usetitle
or opts
.autonumber
):
4372 parser
.error(u
'using output template conflicts with using title, literal title or auto number')
4373 if opts
.usetitle
and opts
.useliteral
:
4374 parser
.error(u
'using title conflicts with using literal title')
4375 if opts
.username
is not None and opts
.password
is None:
4376 opts
.password
= getpass
.getpass(u
'Type account password and press return:')
4377 if opts
.ratelimit
is not None:
4378 numeric_limit
= FileDownloader
.parse_bytes(opts
.ratelimit
)
4379 if numeric_limit
is None:
4380 parser
.error(u
'invalid rate limit specified')
4381 opts
.ratelimit
= numeric_limit
4382 if opts
.retries
is not None:
4384 opts
.retries
= long(opts
.retries
)
4385 except (TypeError, ValueError), err
:
4386 parser
.error(u
'invalid retry count specified')
4388 opts
.playliststart
= int(opts
.playliststart
)
4389 if opts
.playliststart
<= 0:
4390 raise ValueError(u
'Playlist start must be positive')
4391 except (TypeError, ValueError), err
:
4392 parser
.error(u
'invalid playlist start number specified')
4394 opts
.playlistend
= int(opts
.playlistend
)
4395 if opts
.playlistend
!= -1 and (opts
.playlistend
<= 0 or opts
.playlistend
< opts
.playliststart
):
4396 raise ValueError(u
'Playlist end must be greater than playlist start')
4397 except (TypeError, ValueError), err
:
4398 parser
.error(u
'invalid playlist end number specified')
4399 if opts
.extractaudio
:
4400 if opts
.audioformat
not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4401 parser
.error(u
'invalid audio format specified')
4404 fd
= FileDownloader({
4405 'usenetrc': opts
.usenetrc
,
4406 'username': opts
.username
,
4407 'password': opts
.password
,
4408 'quiet': (opts
.quiet
or opts
.geturl
or opts
.gettitle
or opts
.getthumbnail
or opts
.getdescription
or opts
.getfilename
or opts
.getformat
),
4409 'forceurl': opts
.geturl
,
4410 'forcetitle': opts
.gettitle
,
4411 'forcethumbnail': opts
.getthumbnail
,
4412 'forcedescription': opts
.getdescription
,
4413 'forcefilename': opts
.getfilename
,
4414 'forceformat': opts
.getformat
,
4415 'simulate': opts
.simulate
,
4416 'skip_download': (opts
.skip_download
or opts
.simulate
or opts
.geturl
or opts
.gettitle
or opts
.getthumbnail
or opts
.getdescription
or opts
.getfilename
or opts
.getformat
),
4417 'format': opts
.format
,
4418 'format_limit': opts
.format_limit
,
4419 'listformats': opts
.listformats
,
4420 'outtmpl': ((opts
.outtmpl
is not None and opts
.outtmpl
.decode(preferredencoding()))
4421 or (opts
.format
== '-1' and opts
.usetitle
and u
'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4422 or (opts
.format
== '-1' and opts
.useliteral
and u
'%(title)s-%(id)s-%(format)s.%(ext)s')
4423 or (opts
.format
== '-1' and u
'%(id)s-%(format)s.%(ext)s')
4424 or (opts
.usetitle
and opts
.autonumber
and u
'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4425 or (opts
.useliteral
and opts
.autonumber
and u
'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4426 or (opts
.usetitle
and u
'%(stitle)s-%(id)s.%(ext)s')
4427 or (opts
.useliteral
and u
'%(title)s-%(id)s.%(ext)s')
4428 or (opts
.autonumber
and u
'%(autonumber)s-%(id)s.%(ext)s')
4429 or u
'%(id)s.%(ext)s'),
4430 'ignoreerrors': opts
.ignoreerrors
,
4431 'ratelimit': opts
.ratelimit
,
4432 'nooverwrites': opts
.nooverwrites
,
4433 'retries': opts
.retries
,
4434 'continuedl': opts
.continue_dl
,
4435 'noprogress': opts
.noprogress
,
4436 'playliststart': opts
.playliststart
,
4437 'playlistend': opts
.playlistend
,
4438 'logtostderr': opts
.outtmpl
== '-',
4439 'consoletitle': opts
.consoletitle
,
4440 'nopart': opts
.nopart
,
4441 'updatetime': opts
.updatetime
,
4442 'writedescription': opts
.writedescription
,
4443 'writeinfojson': opts
.writeinfojson
,
4444 'matchtitle': opts
.matchtitle
,
4445 'rejecttitle': opts
.rejecttitle
,
4446 'max_downloads': opts
.max_downloads
,
4447 'prefer_free_formats': opts
.prefer_free_formats
,
4449 for extractor
in extractors
:
4450 fd
.add_info_extractor(extractor
)
4453 if opts
.extractaudio
:
4454 fd
.add_post_processor(FFmpegExtractAudioPP(preferredcodec
=opts
.audioformat
, preferredquality
=opts
.audioquality
, keepvideo
=opts
.keepvideo
))
4457 if opts
.update_self
:
4458 updateSelf(fd
, sys
.argv
[0])
4461 if len(all_urls
) < 1:
4462 if not opts
.update_self
:
4463 parser
.error(u
'you must provide at least one URL')
4468 retcode
= fd
.download(all_urls
)
4469 except MaxDownloadsReached
:
4470 fd
.to_screen(u
'--max-download limit reached, aborting.')
4473 # Dump cookie jar if requested
4474 if opts
.cookiefile
is not None:
4477 except (IOError, OSError), err
:
4478 sys
.exit(u
'ERROR: unable to save cookie jar')
def main():
    """Entry point: run the downloader and map expected failures to exit codes."""
    try:
        _real_main()
    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')
if __name__ == '__main__':
    main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: