]>
Commit | Line | Data |
---|---|---|
4fa74b52 RG |
1 | #!/usr/bin/env python |
2 | # -*- coding: utf-8 -*- | |
3 | # Author: Ricardo Garcia Gonzalez | |
64a6f26c | 4 | # Author: Danny Colligan |
49c0028a | 5 | # Author: Benjamin Johnson |
e567ef93 | 6 | # Author: Vasyl' Vavrychuk |
0f7099a5 | 7 | # Author: Witold Baryluk |
5aba6ea4 | 8 | # Author: Paweł Paprota |
ef9f8451 | 9 | # Author: Gergely Imreh |
4fa74b52 | 10 | # License: Public domain code |
80066952 | 11 | import cookielib |
ccbd296b | 12 | import ctypes |
a1f03c7b | 13 | import datetime |
09bd408c | 14 | import email.utils |
1987c232 | 15 | import gzip |
4fa74b52 RG |
16 | import htmlentitydefs |
17 | import httplib | |
2546e767 | 18 | import locale |
4fa74b52 RG |
19 | import math |
20 | import netrc | |
21 | import os | |
22 | import os.path | |
23 | import re | |
24 | import socket | |
25 | import string | |
1987c232 | 26 | import StringIO |
0487b407 | 27 | import subprocess |
4fa74b52 RG |
28 | import sys |
29 | import time | |
30 | import urllib | |
31 | import urllib2 | |
1987c232 | 32 | import zlib |
a04e80a4 RG |
33 | |
34 | # parse_qs was moved from the cgi module to the urlparse module recently. | |
35 | try: | |
36 | from urlparse import parse_qs | |
37 | except ImportError: | |
38 | from cgi import parse_qs | |
4fa74b52 | 39 | |
# HTTP headers sent with every request, mimicking a desktop Firefox browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters and digits as unicode: characters considered safe for titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
49 | ||
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original wrapped this in a one-shot generator and called
    # .next() on it; the indirection added nothing, so compute directly.
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec actually works before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
eae2666c | 65 | |
490fd7ae RG |
66 | def htmlentity_transform(matchobj): |
67 | """Transforms an HTML entity to a Unicode character. | |
d3975459 | 68 | |
490fd7ae RG |
69 | This function receives a match object and is intended to be used with |
70 | the re.sub() function. | |
71 | """ | |
72 | entity = matchobj.group(1) | |
73 | ||
74 | # Known non-numeric HTML entity | |
75 | if entity in htmlentitydefs.name2codepoint: | |
76 | return unichr(htmlentitydefs.name2codepoint[entity]) | |
77 | ||
78 | # Unicode character | |
79 | mobj = re.match(ur'(?u)#(x?\d+)', entity) | |
80 | if mobj is not None: | |
81 | numstr = mobj.group(1) | |
82 | if numstr.startswith(u'x'): | |
83 | base = 16 | |
84 | numstr = u'0%s' % numstr | |
85 | else: | |
86 | base = 10 | |
87 | return unichr(long(numstr, base)) | |
88 | ||
89 | # Unknown entity in name, return its literal representation | |
90 | return (u'&%s;' % entity) | |
91 | ||
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities (&amp;, &#39;, ...) into real characters first.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    # The path separator cannot appear inside a single filename component.
    return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # u'-' selects standard output; on Windows stdout must first be
        # switched to binary mode so video data is not mangled.
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
class DownloadError(Exception):
    """Raised by FileDownloader objects when a download problem occurs.

    Only thrown when the downloader is not configured to continue on
    errors; the exception carries the appropriate error message.
    """
    pass
class SameFileError(Exception):
    """Raised when several downloads would collide on one output file.

    FileDownloader objects throw it when they detect that multiple files
    would have to be written to the same path on disk.
    """
    pass
class PostProcessingError(Exception):
    """Raised for failures inside a post-processing step.

    A PostProcessor's .run() method may raise it to indicate an error
    in the postprocessing task.
    """
    pass
class UnavailableVideoError(Exception):
    """Raised when a video is requested in a format it does not offer."""
    pass
class ContentTooShortError(Exception):
    """Raised when a download delivers fewer bytes than announced.

    FileDownloader objects use it when the data actually received is
    smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Byte counts (both in bytes)
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.expected = expected
        self.downloaded = downloaded
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data without the zlib header;
        # try that first, then fall back to a regular zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer Pythons accept the response code in the constructor;
        # older ones need it assigned after construction.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Delete-then-add so the standard headers override any value the
        # caller set.  NOTE(review): urllib2 stores header keys in
        # .capitalize()d form, while std_headers uses mixed case
        # (e.g. 'User-Agent') -- confirm the membership test matches
        # the intended keys.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # The marker header is internal only; strip it (and the
        # Accept-encoding header) before the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    """

    params = None                # Options dictionary passed to the constructor
    _ies = []                    # Registered InfoExtractors, in priority order
    _pps = []                    # Post-processor chain
    _download_retcode = None     # Return code for download(); 1 after ignored errors
    _num_downloads = None        # Ordinal of the current download
    _screen_file = None          # Stream used by to_screen()
    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        # logtostderr=True indexes the second element of the pair (stderr).
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params
4fa74b52 RG |
307 | @staticmethod |
308 | def pmkdir(filename): | |
309 | """Create directory components in filename. Similar to Unix "mkdir -p".""" | |
310 | components = filename.split(os.sep) | |
311 | aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))] | |
3af1e172 | 312 | aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator |
4fa74b52 RG |
313 | for dir in aggregate: |
314 | if not os.path.exists(dir): | |
315 | os.mkdir(dir) | |
3fb2c487 | 316 | |
4fa74b52 RG |
317 | @staticmethod |
318 | def format_bytes(bytes): | |
319 | if bytes is None: | |
320 | return 'N/A' | |
8497c36d RG |
321 | if type(bytes) is str: |
322 | bytes = float(bytes) | |
323 | if bytes == 0.0: | |
4fa74b52 RG |
324 | exponent = 0 |
325 | else: | |
8497c36d | 326 | exponent = long(math.log(bytes, 1024.0)) |
4fa74b52 | 327 | suffix = 'bkMGTPEZY'[exponent] |
4fa74b52 RG |
328 | converted = float(bytes) / float(1024**exponent) |
329 | return '%.2f%s' % (converted, suffix) | |
330 | ||
331 | @staticmethod | |
332 | def calc_percent(byte_counter, data_len): | |
333 | if data_len is None: | |
334 | return '---.-%' | |
335 | return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) | |
336 | ||
337 | @staticmethod | |
338 | def calc_eta(start, now, total, current): | |
339 | if total is None: | |
340 | return '--:--' | |
341 | dif = now - start | |
342 | if current == 0 or dif < 0.001: # One millisecond | |
343 | return '--:--' | |
344 | rate = float(current) / dif | |
345 | eta = long((float(total) - float(current)) / rate) | |
346 | (eta_mins, eta_secs) = divmod(eta, 60) | |
347 | if eta_mins > 99: | |
348 | return '--:--' | |
349 | return '%02d:%02d' % (eta_mins, eta_secs) | |
350 | ||
5121ef20 | 351 | @staticmethod |
4fa74b52 RG |
352 | def calc_speed(start, now, bytes): |
353 | dif = now - start | |
354 | if bytes == 0 or dif < 0.001: # One millisecond | |
9fcd8355 | 355 | return '%10s' % '---b/s' |
4fa74b52 RG |
356 | return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) |
357 | ||
358 | @staticmethod | |
359 | def best_block_size(elapsed_time, bytes): | |
360 | new_min = max(bytes / 2.0, 1.0) | |
361 | new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB | |
362 | if elapsed_time < 0.001: | |
e1f18b8a | 363 | return long(new_max) |
4fa74b52 RG |
364 | rate = bytes / elapsed_time |
365 | if rate > new_max: | |
e1f18b8a | 366 | return long(new_max) |
4fa74b52 | 367 | if rate < new_min: |
e1f18b8a RG |
368 | return long(new_min) |
369 | return long(rate) | |
4fa74b52 | 370 | |
acd3d842 RG |
371 | @staticmethod |
372 | def parse_bytes(bytestr): | |
373 | """Parse a string indicating a byte quantity into a long integer.""" | |
374 | matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) | |
375 | if matchobj is None: | |
376 | return None | |
377 | number = float(matchobj.group(1)) | |
378 | multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) | |
379 | return long(round(number * multiplier)) | |
380 | ||
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the extractor keeps a reference back to us.
        ie.set_downloader(self)
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration: the post processor keeps a reference back to us.
        pp.set_downloader(self)
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                # skip_eol=True suppresses the newline (progress-bar updates).
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            # Swallow encoding errors only when the caller asked us to.
            if not ignore_encoding_errors:
                raise
    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-compatible escape sequence (OSC 0) sets the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
    def fixed_template(self):
        """Checks if the output template is fixed."""
        # A template is "fixed" when it contains no %(field)s placeholders,
        # i.e. every download would be written to the same literal path.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: record the failure in the return code.
        self._download_retcode = 1
acd3d842 RG |
434 | def slow_down(self, start_time, byte_counter): |
435 | """Sleep if the download speed is over the rate limit.""" | |
d0a9affb | 436 | rate_limit = self.params.get('ratelimit', None) |
acd3d842 RG |
437 | if rate_limit is None or byte_counter == 0: |
438 | return | |
439 | now = time.time() | |
440 | elapsed = now - start_time | |
441 | if elapsed <= 0.0: | |
442 | return | |
443 | speed = float(byte_counter) / elapsed | |
444 | if speed > rate_limit: | |
445 | time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) | |
3fb2c487 RG |
446 | |
447 | def temp_name(self, filename): | |
448 | """Returns a temporary filename for the given filename.""" | |
449 | if self.params.get('nopart', False) or filename == u'-' or \ | |
450 | (os.path.exists(filename) and not os.path.isfile(filename)): | |
451 | return filename | |
452 | return filename + u'.part' | |
453 | ||
8cc42e7c RG |
454 | def undo_temp_name(self, filename): |
455 | if filename.endswith(u'.part'): | |
456 | return filename[:-len(u'.part')] | |
457 | return filename | |
458 | ||
    def try_rename(self, old_filename, new_filename):
        """Rename old_filename to new_filename, reporting (not raising) failures."""
        try:
            # Renaming onto itself would be a no-op (and can fail on some
            # platforms), so skip it.
            if old_filename == new_filename:
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
e3018902 RG |
466 | |
467 | def try_utime(self, filename, last_modified_hdr): | |
468 | """Try to set the last-modified time of the given file.""" | |
469 | if last_modified_hdr is None: | |
470 | return | |
471 | if not os.path.isfile(filename): | |
472 | return | |
473 | timestr = last_modified_hdr | |
474 | if timestr is None: | |
475 | return | |
476 | filetime = timeconvert(timestr) | |
477 | if filetime is None: | |
478 | return | |
479 | try: | |
480 | os.utime(filename,(time.time(), filetime)) | |
481 | except: | |
482 | pass | |
acd3d842 | 483 | |
    def report_destination(self, filename):
        """Report destination filename."""
        # Encoding errors are ignored: a misprinted name must not abort a download.
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # \r rewrites the same screen line on every update.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a generic message when the name cannot be encoded.
            self.to_screen(u'[download] The file has already been downloaded')
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # With the progress bar active the line is already drawn;
            # just terminate it.
            self.to_screen(u'')
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands the 'outtmpl' template with the video information plus
        the synthetic fields 'epoch' and 'autonumber'.  Returns None
        (after reporting trouble) when the template cannot be expanded.
        """
        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        Handles forced printings, simulate mode, overwrite protection,
        directory creation, the actual download and postprocessing.
        """
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

            return

        if filename is None:
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        try:
            # player_url: presumably the URL of the SWF player for RTMP
            # downloads -- TODO confirm against the extractors.
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
    def download(self, url_list):
        """Download a given list of URLs.

        Returns the process return code (0 on success, 1 after ignored
        errors).  Raises SameFileError when several URLs would all be
        written to one fixed output file.
        """
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
65cd34c5 RG |
613 | |
614 | def post_process(self, filename, ie_info): | |
615 | """Run the postprocessing chain on the given file.""" | |
616 | info = dict(ie_info) | |
617 | info['filepath'] = filename | |
618 | for pp in self._pps: | |
619 | info = pp.run(info) | |
620 | if info is None: | |
621 | break | |
d3975459 | 622 | |
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an RTMP stream by driving the external rtmpdump program."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); retval == 1 additionally asks to keep trying (-k 1).
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            # No progress between attempts: give up instead of looping forever.
            if prevsize == cursize and retval == 1:
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
e616ec0c | 655 | def _do_download(self, filename, url, player_url): |
62cf7aaf | 656 | # Check file already present |
3fb2c487 | 657 | if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): |
62cf7aaf RG |
658 | self.report_file_already_downloaded(filename) |
659 | return True | |
660 | ||
0487b407 RG |
661 | # Attempt to download using rtmpdump |
662 | if url.startswith('rtmp'): | |
e616ec0c | 663 | return self._download_with_rtmpdump(filename, url, player_url) |
0487b407 | 664 | |
62cf7aaf | 665 | tmpfilename = self.temp_name(filename) |
55e7c75e | 666 | stream = None |
9c457d2a | 667 | open_mode = 'wb' |
1987c232 RG |
668 | |
669 | # Do not include the Accept-Encoding header | |
670 | headers = {'Youtubedl-no-compression': 'True'} | |
671 | basic_request = urllib2.Request(url, None, headers) | |
672 | request = urllib2.Request(url, None, headers) | |
7db85b2c | 673 | |
9c457d2a | 674 | # Establish possible resume length |
62cf7aaf RG |
675 | if os.path.isfile(tmpfilename): |
676 | resume_len = os.path.getsize(tmpfilename) | |
55e7c75e RG |
677 | else: |
678 | resume_len = 0 | |
9c457d2a RG |
679 | |
680 | # Request parameters in case of being able to resume | |
850ab765 | 681 | if self.params.get('continuedl', False) and resume_len != 0: |
7db85b2c RG |
682 | self.report_resuming_byte(resume_len) |
683 | request.add_header('Range','bytes=%d-' % resume_len) | |
9c457d2a | 684 | open_mode = 'ab' |
55e7c75e | 685 | |
7031008c RG |
686 | count = 0 |
687 | retries = self.params.get('retries', 0) | |
101e0d1e | 688 | while count <= retries: |
7031008c RG |
689 | # Establish connection |
690 | try: | |
691 | data = urllib2.urlopen(request) | |
692 | break | |
693 | except (urllib2.HTTPError, ), err: | |
ac249f42 | 694 | if (err.code < 500 or err.code >= 600) and err.code != 416: |
101e0d1e | 695 | # Unexpected HTTP error |
7031008c | 696 | raise |
101e0d1e RG |
697 | elif err.code == 416: |
698 | # Unable to resume (requested range not satisfiable) | |
699 | try: | |
700 | # Open the connection again without the range header | |
701 | data = urllib2.urlopen(basic_request) | |
702 | content_length = data.info()['Content-Length'] | |
703 | except (urllib2.HTTPError, ), err: | |
ac249f42 | 704 | if err.code < 500 or err.code >= 600: |
101e0d1e RG |
705 | raise |
706 | else: | |
707 | # Examine the reported length | |
268fb2bd | 708 | if (content_length is not None and |
204c9398 | 709 | (resume_len - 100 < long(content_length) < resume_len + 100)): |
268fb2bd RG |
710 | # The file had already been fully downloaded. |
711 | # Explanation to the above condition: in issue #175 it was revealed that | |
712 | # YouTube sometimes adds or removes a few bytes from the end of the file, | |
713 | # changing the file size slightly and causing problems for some users. So | |
714 | # I decided to implement a suggested change and consider the file | |
715 | # completely downloaded if the file size differs less than 100 bytes from | |
716 | # the one in the hard drive. | |
101e0d1e | 717 | self.report_file_already_downloaded(filename) |
62cf7aaf | 718 | self.try_rename(tmpfilename, filename) |
101e0d1e RG |
719 | return True |
720 | else: | |
721 | # The length does not match, we start the download over | |
722 | self.report_unable_to_resume() | |
723 | open_mode = 'wb' | |
724 | break | |
725 | # Retry | |
726 | count += 1 | |
727 | if count <= retries: | |
728 | self.report_retry(count, retries) | |
729 | ||
730 | if count > retries: | |
731 | self.trouble(u'ERROR: giving up after %s retries' % retries) | |
732 | return False | |
7db85b2c | 733 | |
4fa74b52 | 734 | data_len = data.info().get('Content-length', None) |
106d091e RG |
735 | if data_len is not None: |
736 | data_len = long(data_len) + resume_len | |
4fa74b52 | 737 | data_len_str = self.format_bytes(data_len) |
106d091e | 738 | byte_counter = 0 + resume_len |
4fa74b52 RG |
739 | block_size = 1024 |
740 | start = time.time() | |
741 | while True: | |
bafa5cd9 | 742 | # Download and write |
4fa74b52 RG |
743 | before = time.time() |
744 | data_block = data.read(block_size) | |
745 | after = time.time() | |
975a91d0 | 746 | if len(data_block) == 0: |
4fa74b52 | 747 | break |
975a91d0 | 748 | byte_counter += len(data_block) |
55e7c75e RG |
749 | |
750 | # Open file just in time | |
751 | if stream is None: | |
752 | try: | |
62cf7aaf | 753 | (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) |
8cc42e7c | 754 | filename = self.undo_temp_name(tmpfilename) |
55e7c75e RG |
755 | self.report_destination(filename) |
756 | except (OSError, IOError), err: | |
db7e31b8 | 757 | self.trouble(u'ERROR: unable to open for writing: %s' % str(err)) |
55e7c75e | 758 | return False |
131efd1a RG |
759 | try: |
760 | stream.write(data_block) | |
761 | except (IOError, OSError), err: | |
d67e0974 RG |
762 | self.trouble(u'\nERROR: unable to write data: %s' % str(err)) |
763 | return False | |
975a91d0 | 764 | block_size = self.best_block_size(after - before, len(data_block)) |
4fa74b52 | 765 | |
55e7c75e RG |
766 | # Progress message |
767 | percent_str = self.calc_percent(byte_counter, data_len) | |
975a91d0 RG |
768 | eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) |
769 | speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) | |
55e7c75e RG |
770 | self.report_progress(percent_str, data_len_str, speed_str, eta_str) |
771 | ||
acd3d842 | 772 | # Apply rate limit |
975a91d0 | 773 | self.slow_down(start, byte_counter - resume_len) |
acd3d842 | 774 | |
6f0ff3ba | 775 | stream.close() |
bafa5cd9 | 776 | self.report_finish() |
b905e5f5 | 777 | if data_len is not None and byte_counter != data_len: |
d69a1c91 | 778 | raise ContentTooShortError(byte_counter, long(data_len)) |
62cf7aaf | 779 | self.try_rename(tmpfilename, filename) |
e3018902 | 780 | |
09bd408c | 781 | # Update file modification time |
e3018902 RG |
782 | if self.params.get('updatetime', True): |
783 | self.try_utime(filename, data.info().get('last-modified', None)) | |
784 | ||
55e7c75e | 785 | return True |
4fa74b52 RG |
786 | |
class InfoExtractor(object):
    """Information Extractor base class.

    Given a URL, an information extractor pulls out the data describing the
    video (or videos) that URL refers to: the real media URL, the literal and
    simplified titles, the uploader nickname, the filename extension, the
    format, and the SWF player URL (which may be None).  That dictionary is
    handed to the FileDownloader, which may then download the video to disk,
    print fields, or take other actions.

    Optional fields -- 'thumbnail' (full URL to a thumbnail image) and
    'description' (one-line description) -- exist mainly so youtube-dl can
    back a video-search frontend; they are only consulted by the respective
    forced-printing functions.

    Subclasses should override _real_initialize() and _real_extract(), as
    well as the suitable() static method, and are normally instantiated and
    registered with the main downloader.
    """

    # Lazily set to True by initialize() after _real_initialize() succeeds.
    _ready = False
    # The FileDownloader this IE reports to; may be None.
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL (base: never)."""
        return False

    def initialize(self):
        """Initialize the instance once (authentication, etc.); idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract and return URL information."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE should report through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
857 | ||
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Itags ordered best-quality first; used to pick a download format.
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    # Maps itag -> filename extension (anything else falls back to 'flv').
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # container is unknown: could be MOV, AVI, ...
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        """Return True when the URL matches a recognized YouTube form."""
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Announce the attempt to force the site language to English."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Announce the login attempt."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Announce the age-confirmation attempt."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Announce the video webpage download."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Announce the get_video_info download."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Announce that a requested format is unavailable."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Announce that the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set site language and, when credentials exist, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Credentials: explicit --username/--password win; otherwise try .netrc.
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Force English so later regexes against page text behave predictably.
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # Without a username there is nothing to authenticate.
        if username is None:
            return

        # Submit the login form.
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # The login form reappearing in the response means the login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age so age-restricted videos are reachable.
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and media URLs for one YouTube video and hand each
        selected format to the downloader via process_info()."""
        # Pull the video id out of the URL (regex group 2).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Fetch the watch page (needed for SWF player URL and upload date).
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # SWF player URL, if present (used later for RTMP downloads).
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Strip the JavaScript backslash-escaping.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Query get_video_info, trying several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Begin pulling individual fields out of video_info / the watch page.
        self.report_information_extraction(video_id)

        # Uploader nickname.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Literal title, sanitized for use as a filename component.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: collapse anything outside the safe alphabet to '_'.
        simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Thumbnail: non-fatal if missing.
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date, normalized to YYYYMMDD when one of the known formats parses.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # Description is only scraped when the user asked to print it.
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # Session token (presence was already verified above).
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Work out which format(s) to download.
        req_format = self._downloader.params.get('format', None)

        if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Decode the comma-separated, &-joined itag->url stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
            url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                # Default: single best-quality format.
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]
            elif req_format == '-1':
                # '-1' means every available format.
                video_url_list = [(f, url_map[f]) for f in existing_formats]
            else:
                # A specific itag was requested.
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])]

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # Live/RTMP streams expose a single 'conn' URL instead of a format map.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        else:
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # Each selected format counts as one download.
            self._downloader.increment_downloads()

            # Extension derived from the itag; unknown itags default to flv.
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Hand the assembled record to the downloader.
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': player_url,
                })
            except UnavailableVideoError as err:
                self._downloader.trouble(u'\nERROR: unable to download video')
42bcd27d | 1139 | |
4fa74b52 | 1140 | |
020f7150 RG |
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used to delegate 'yt-' prefixed videos.
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        """Constructor; keeps a YoutubeIE around for delegated extraction."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True when the URL looks like a Metacafe watch page."""
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Announce the disclaimer-page retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Announce the age-confirmation attempt."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Announce the watch-page download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and disable filtering (age check)."""
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # POST the filter form to confirm age and switch the filter off.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract one Metacafe video (or delegate 'yt-' ids to YoutubeIE)."""
        # Video id is group 1, simplified title is group 2.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Metacafe mirrors of YouTube videos carry a 'yt-' prefix; delegate those.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # This is a genuine Metacafe video.
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Fetch the watch page for further scraping.
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Scrape media URL, title, and uploader from the page.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Older page layout: mediaURL query parameter.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Append the gdaKey access token when the page provides one.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer layout: media info lives in the flashvars JSON blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Hand the assembled record to the downloader.
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
020f7150 | 1284 | |
25af2bce | 1285 | |
4135fa45 WB |
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True when the URL looks like a Dailymotion video page."""
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Announce the video-page download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No authentication or setup needed for Dailymotion.
        return

    def _real_extract(self, url):
        """Extract one Dailymotion video and hand it to the downloader."""
        # Video id is group 1, simplified title is group 2.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # A matching URL counts as a new video.
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Fetch the video page for further scraping.
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Scrape the media URL from the player's addVariable("video", ...) call.
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Hand the assembled record to the downloader.
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
4135fa45 | 1373 | |
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Accepts videoplay URLs on any of the national Google Video domains
    and scrapes the media URL, title and (optionally) a thumbnail from
    the watch page.
    """

    # group(1) captures the docid query parameter (may be negative).
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id (the docid query parameter) from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Assume MP4; downgraded to FLV below if only the Flash URL exists.
        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct MP4 download link; fall back to the FLV stream URL,
            # which is embedded with backslash-escaped hex sequences.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Unescape the literal '\x3d' (=) and '\x26' (&) sequences.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Filesystem-safe title: collapse disallowed character runs to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The watch page has no usable thumbnail; query the search page
            # instead (abs() because docids may be negative).
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # NOTE(review): video_description and video_thumbnail are computed
        # above but never placed in the info dict below — confirm whether
        # process_info expects 'description'/'thumbnail' keys here.
        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
49c0028a | 1483 | |
1484 | ||
1485 | class PhotobucketIE(InfoExtractor): | |
1486 | """Information extractor for photobucket.com.""" | |
1487 | ||
1488 | _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' | |
1489 | ||
1490 | def __init__(self, downloader=None): | |
1491 | InfoExtractor.__init__(self, downloader) | |
1492 | ||
1493 | @staticmethod | |
1494 | def suitable(url): | |
1495 | return (re.match(PhotobucketIE._VALID_URL, url) is not None) | |
1496 | ||
1497 | def report_download_webpage(self, video_id): | |
1498 | """Report webpage download.""" | |
331ce0a0 | 1499 | self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id) |
49c0028a | 1500 | |
1501 | def report_extraction(self, video_id): | |
1502 | """Report information extraction.""" | |
331ce0a0 | 1503 | self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id) |
49c0028a | 1504 | |
1505 | def _real_initialize(self): | |
1506 | return | |
1507 | ||
1508 | def _real_extract(self, url): | |
1509 | # Extract id from URL | |
1510 | mobj = re.match(self._VALID_URL, url) | |
1511 | if mobj is None: | |
1512 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) | |
1513 | return | |
1514 | ||
df372a65 | 1515 | # At this point we have a new video |
9bf7fa52 | 1516 | self._downloader.increment_downloads() |
49c0028a | 1517 | video_id = mobj.group(1) |
1518 | ||
1519 | video_extension = 'flv' | |
1520 | ||
1521 | # Retrieve video webpage to extract further information | |
1522 | request = urllib2.Request(url) | |
1523 | try: | |
1524 | self.report_download_webpage(video_id) | |
1525 | webpage = urllib2.urlopen(request).read() | |
1526 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
1527 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) | |
1528 | return | |
1529 | ||
1530 | # Extract URL, uploader, and title from webpage | |
1531 | self.report_extraction(video_id) | |
1532 | mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage) | |
1533 | if mobj is None: | |
1534 | self._downloader.trouble(u'ERROR: unable to extract media URL') | |
1535 | return | |
1536 | mediaURL = urllib.unquote(mobj.group(1)) | |
1537 | ||
1538 | video_url = mediaURL | |
1539 | ||
1540 | mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage) | |
1541 | if mobj is None: | |
1542 | self._downloader.trouble(u'ERROR: unable to extract title') | |
1543 | return | |
1544 | video_title = mobj.group(1).decode('utf-8') | |
490fd7ae | 1545 | video_title = sanitize_title(video_title) |
31cbdaaf | 1546 | simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) |
49c0028a | 1547 | |
1548 | video_uploader = mobj.group(2).decode('utf-8') | |
1549 | ||
1550 | try: | |
1551 | # Process video information | |
1552 | self._downloader.process_info({ | |
1553 | 'id': video_id.decode('utf-8'), | |
1554 | 'url': video_url.decode('utf-8'), | |
490fd7ae | 1555 | 'uploader': video_uploader, |
138b11f3 | 1556 | 'upload_date': u'NA', |
490fd7ae | 1557 | 'title': video_title, |
31cbdaaf | 1558 | 'stitle': simple_title, |
490fd7ae | 1559 | 'ext': video_extension.decode('utf-8'), |
6ba562b0 | 1560 | 'format': u'NA', |
e616ec0c | 1561 | 'player_url': None, |
490fd7ae | 1562 | }) |
73f4e7af | 1563 | except UnavailableVideoError: |
09cc744c | 1564 | self._downloader.trouble(u'\nERROR: unable to download video') |
490fd7ae RG |
1565 | |
1566 | ||
61945318 RG |
1567 | class YahooIE(InfoExtractor): |
1568 | """Information extractor for video.yahoo.com.""" | |
1569 | ||
1570 | # _VALID_URL matches all Yahoo! Video URLs | |
1571 | # _VPAGE_URL matches only the extractable '/watch/' URLs | |
1572 | _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' | |
1573 | _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' | |
1574 | ||
1575 | def __init__(self, downloader=None): | |
1576 | InfoExtractor.__init__(self, downloader) | |
1577 | ||
1578 | @staticmethod | |
1579 | def suitable(url): | |
1580 | return (re.match(YahooIE._VALID_URL, url) is not None) | |
1581 | ||
1582 | def report_download_webpage(self, video_id): | |
1583 | """Report webpage download.""" | |
331ce0a0 | 1584 | self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id) |
61945318 RG |
1585 | |
1586 | def report_extraction(self, video_id): | |
1587 | """Report information extraction.""" | |
331ce0a0 | 1588 | self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id) |
61945318 RG |
1589 | |
1590 | def _real_initialize(self): | |
1591 | return | |
1592 | ||
df372a65 | 1593 | def _real_extract(self, url, new_video=True): |
61945318 RG |
1594 | # Extract ID from URL |
1595 | mobj = re.match(self._VALID_URL, url) | |
1596 | if mobj is None: | |
1597 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) | |
1598 | return | |
1599 | ||
df372a65 | 1600 | # At this point we have a new video |
9bf7fa52 | 1601 | self._downloader.increment_downloads() |
61945318 RG |
1602 | video_id = mobj.group(2) |
1603 | video_extension = 'flv' | |
1604 | ||
1605 | # Rewrite valid but non-extractable URLs as | |
1606 | # extractable English language /watch/ URLs | |
1607 | if re.match(self._VPAGE_URL, url) is None: | |
1608 | request = urllib2.Request(url) | |
1609 | try: | |
1610 | webpage = urllib2.urlopen(request).read() | |
1611 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
1612 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) | |
1613 | return | |
1614 | ||
1615 | mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) | |
1616 | if mobj is None: | |
1617 | self._downloader.trouble(u'ERROR: Unable to extract id field') | |
1618 | return | |
1619 | yahoo_id = mobj.group(1) | |
1620 | ||
1621 | mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) | |
1622 | if mobj is None: | |
1623 | self._downloader.trouble(u'ERROR: Unable to extract vid field') | |
1624 | return | |
1625 | yahoo_vid = mobj.group(1) | |
1626 | ||
1627 | url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id) | |
df372a65 | 1628 | return self._real_extract(url, new_video=False) |
61945318 RG |
1629 | |
1630 | # Retrieve video webpage to extract further information | |
1631 | request = urllib2.Request(url) | |
1632 | try: | |
1633 | self.report_download_webpage(video_id) | |
1634 | webpage = urllib2.urlopen(request).read() | |
1635 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
1636 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) | |
1637 | return | |
1638 | ||
1639 | # Extract uploader and title from webpage | |
1640 | self.report_extraction(video_id) | |
1641 | mobj = re.search(r'<meta name="title" content="(.*)" />', webpage) | |
1642 | if mobj is None: | |
1643 | self._downloader.trouble(u'ERROR: unable to extract video title') | |
1644 | return | |
1645 | video_title = mobj.group(1).decode('utf-8') | |
1646 | simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) | |
1647 | ||
1648 | mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage) | |
1649 | if mobj is None: | |
1650 | self._downloader.trouble(u'ERROR: unable to extract video uploader') | |
1651 | return | |
1652 | video_uploader = mobj.group(1).decode('utf-8') | |
1653 | ||
7e58d568 RG |
1654 | # Extract video thumbnail |
1655 | mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage) | |
1656 | if mobj is None: | |
1657 | self._downloader.trouble(u'ERROR: unable to extract video thumbnail') | |
1658 | return | |
1659 | video_thumbnail = mobj.group(1).decode('utf-8') | |
1660 | ||
1661 | # Extract video description | |
1662 | mobj = re.search(r'<meta name="description" content="(.*)" />', webpage) | |
1663 | if mobj is None: | |
1664 | self._downloader.trouble(u'ERROR: unable to extract video description') | |
1665 | return | |
1666 | video_description = mobj.group(1).decode('utf-8') | |
1667 | if not video_description: video_description = 'No description available.' | |
1668 | ||
61945318 RG |
1669 | # Extract video height and width |
1670 | mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage) | |
1671 | if mobj is None: | |
1672 | self._downloader.trouble(u'ERROR: unable to extract video height') | |
1673 | return | |
1674 | yv_video_height = mobj.group(1) | |
1675 | ||
1676 | mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage) | |
1677 | if mobj is None: | |
1678 | self._downloader.trouble(u'ERROR: unable to extract video width') | |
1679 | return | |
1680 | yv_video_width = mobj.group(1) | |
1681 | ||
1682 | # Retrieve video playlist to extract media URL | |
1683 | # I'm not completely sure what all these options are, but we | |
1684 | # seem to need most of them, otherwise the server sends a 401. | |
1685 | yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents | |
1686 | yv_bitrate = '700' # according to Wikipedia this is hard-coded | |
1687 | request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + | |
1688 | '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height + | |
1689 | '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') | |
1690 | try: | |
1691 | self.report_download_webpage(video_id) | |
1692 | webpage = urllib2.urlopen(request).read() | |
1693 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
1694 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) | |
1695 | return | |
1696 | ||
1697 | # Extract media URL from playlist XML | |
1698 | mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage) | |
1699 | if mobj is None: | |
1700 | self._downloader.trouble(u'ERROR: Unable to extract media URL') | |
1701 | return | |
1702 | video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') | |
1703 | video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url) | |
1704 | ||
1705 | try: | |
1706 | # Process video information | |
1707 | self._downloader.process_info({ | |
1708 | 'id': video_id.decode('utf-8'), | |
1709 | 'url': video_url, | |
1710 | 'uploader': video_uploader, | |
138b11f3 | 1711 | 'upload_date': u'NA', |
61945318 RG |
1712 | 'title': video_title, |
1713 | 'stitle': simple_title, | |
1714 | 'ext': video_extension.decode('utf-8'), | |
7e58d568 RG |
1715 | 'thumbnail': video_thumbnail.decode('utf-8'), |
1716 | 'description': video_description, | |
1717 | 'thumbnail': video_thumbnail, | |
1718 | 'description': video_description, | |
e616ec0c | 1719 | 'player_url': None, |
61945318 | 1720 | }) |
73f4e7af | 1721 | except UnavailableVideoError: |
09cc744c | 1722 | self._downloader.trouble(u'\nERROR: unable to download video') |
61945318 RG |
1723 | |
1724 | ||
490fd7ae RG |
1725 | class GenericIE(InfoExtractor): |
1726 | """Generic last-resort information extractor.""" | |
1727 | ||
1728 | def __init__(self, downloader=None): | |
1729 | InfoExtractor.__init__(self, downloader) | |
1730 | ||
1731 | @staticmethod | |
1732 | def suitable(url): | |
1733 | return True | |
1734 | ||
1735 | def report_download_webpage(self, video_id): | |
1736 | """Report webpage download.""" | |
331ce0a0 RG |
1737 | self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') |
1738 | self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id) | |
490fd7ae RG |
1739 | |
1740 | def report_extraction(self, video_id): | |
1741 | """Report information extraction.""" | |
331ce0a0 | 1742 | self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) |
490fd7ae RG |
1743 | |
1744 | def _real_initialize(self): | |
1745 | return | |
1746 | ||
1747 | def _real_extract(self, url): | |
df372a65 | 1748 | # At this point we have a new video |
9bf7fa52 | 1749 | self._downloader.increment_downloads() |
df372a65 | 1750 | |
490fd7ae RG |
1751 | video_id = url.split('/')[-1] |
1752 | request = urllib2.Request(url) | |
1753 | try: | |
1754 | self.report_download_webpage(video_id) | |
1755 | webpage = urllib2.urlopen(request).read() | |
1756 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
1757 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) | |
1758 | return | |
1759 | except ValueError, err: | |
1760 | # since this is the last-resort InfoExtractor, if | |
1761 | # this error is thrown, it'll be thrown here | |
1762 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) | |
1763 | return | |
1764 | ||
a9806fd8 | 1765 | self.report_extraction(video_id) |
490fd7ae RG |
1766 | # Start with something easy: JW Player in SWFObject |
1767 | mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) | |
1768 | if mobj is None: | |
1769 | # Broaden the search a little bit | |
1770 | mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) | |
1771 | if mobj is None: | |
1772 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) | |
1773 | return | |
1774 | ||
1775 | # It's possible that one of the regexes | |
1776 | # matched, but returned an empty group: | |
1777 | if mobj.group(1) is None: | |
1778 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) | |
1779 | return | |
1780 | ||
1781 | video_url = urllib.unquote(mobj.group(1)) | |
1782 | video_id = os.path.basename(video_url) | |
1783 | ||
1784 | # here's a fun little line of code for you: | |
1785 | video_extension = os.path.splitext(video_id)[1][1:] | |
1786 | video_id = os.path.splitext(video_id)[0] | |
1787 | ||
1788 | # it's tempting to parse this further, but you would | |
1789 | # have to take into account all the variations like | |
1790 | # Video Title - Site Name | |
1791 | # Site Name | Video Title | |
1792 | # Video Title - Tagline | Site Name | |
1793 | # and so on and so forth; it's just not practical | |
1794 | mobj = re.search(r'<title>(.*)</title>', webpage) | |
1795 | if mobj is None: | |
1796 | self._downloader.trouble(u'ERROR: unable to extract title') | |
1797 | return | |
1798 | video_title = mobj.group(1).decode('utf-8') | |
1799 | video_title = sanitize_title(video_title) | |
31cbdaaf | 1800 | simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) |
490fd7ae RG |
1801 | |
1802 | # video uploader is domain name | |
1803 | mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) | |
1804 | if mobj is None: | |
1805 | self._downloader.trouble(u'ERROR: unable to extract title') | |
1806 | return | |
1807 | video_uploader = mobj.group(1).decode('utf-8') | |
1808 | ||
1809 | try: | |
1810 | # Process video information | |
1811 | self._downloader.process_info({ | |
1812 | 'id': video_id.decode('utf-8'), | |
1813 | 'url': video_url.decode('utf-8'), | |
1814 | 'uploader': video_uploader, | |
138b11f3 | 1815 | 'upload_date': u'NA', |
490fd7ae | 1816 | 'title': video_title, |
31cbdaaf | 1817 | 'stitle': simple_title, |
49c0028a | 1818 | 'ext': video_extension.decode('utf-8'), |
6ba562b0 | 1819 | 'format': u'NA', |
e616ec0c | 1820 | 'player_url': None, |
49c0028a | 1821 | }) |
73f4e7af | 1822 | except UnavailableVideoError, err: |
09cc744c | 1823 | self._downloader.trouble(u'\nERROR: unable to download video') |
49c0028a | 1824 | |
1825 | ||
25af2bce RG |
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form "ytsearch[N|all]:QUERY": downloads
    YouTube result pages and feeds each found video URL to the wrapped
    YoutubeIE instance.
    """
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'  # present while more result pages exist
    _youtube_ie = None  # the YoutubeIE that performs the actual extraction
    _max_youtube_results = 1000  # hard cap for 'ytsearchN' / 'ytsearchall'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search results page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        # Validate the "ytsearchN:query" pseudo-URL and split it apart.
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch' (8 chars)
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "ytsearch:" means first result only.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # The indicator matches href="/watch?v=ID"; slice out ID by
                # splitting on '=' and dropping the trailing quote character.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: hand over everything collected so far.
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1
1916 | ||
7e58d568 RG |
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs of the form "gvsearch[N|all]:QUERY": downloads
    Google Video result pages and hands every found docid to the wrapped
    GoogleIE instance.
    """
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
    # NOTE(review): 'start' is fed the page counter (1, 2, 3, ...) by
    # _download_n_results below — verify it is not meant to be a result
    # offset, otherwise pagination barely advances.
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    _google_ie = None  # the GoogleIE that performs the actual extraction
    _max_google_results = 1000  # hard cap for 'gvsearchN' / 'gvsearchall'

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    @staticmethod
    def suitable(url):
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search results page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        # Validate and split the "gvsearchN:query" pseudo-URL.
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch' (8 chars)
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "gvsearch:" means first result only.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (docids), de-duplicated in order.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No more result pages: extract everything collected.
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                return

            pagenum = pagenum + 1
2007 | ||
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo-URLs of the form "yvsearch[N|all]:QUERY": downloads
    Yahoo! Video result pages and hands every found watch path to the
    wrapped YahooIE instance.
    """
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    # NOTE(review): 'o' is fed the page counter (1, 2, 3, ...) by
    # _download_n_results below — verify that is the parameter Yahoo!
    # expects for paging rather than a result offset.
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # NOTE(review): this pattern is extremely loose (optional whitespace +
    # 'Next') and can match unrelated page text — confirm it is reliable.
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _yahoo_ie = None  # the YahooIE that performs the actual extraction
    _max_yahoo_results = 1000  # hard cap for 'yvsearchN' / 'yvsearchall'

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    @staticmethod
    def suitable(url):
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search results page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        # Validate and split the "yvsearchN:query" pseudo-URL.
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch' (8 chars)
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "yvsearch:" means first result only.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers ('VID/ID' paths), de-duplicated in order.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No more result pages: extract everything collected.
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                return

            pagenum = pagenum + 1
2098 | ||
0c2dc87d RG |
2099 | class YoutubePlaylistIE(InfoExtractor): |
2100 | """Information Extractor for YouTube playlists.""" | |
2101 | ||
d119b54d | 2102 | _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' |
f74e22ae | 2103 | _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' |
0c2dc87d | 2104 | _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' |
ce5cafea | 2105 | _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>' |
0c2dc87d RG |
2106 | _youtube_ie = None |
2107 | ||
2108 | def __init__(self, youtube_ie, downloader=None): | |
2109 | InfoExtractor.__init__(self, downloader) | |
2110 | self._youtube_ie = youtube_ie | |
d3975459 | 2111 | |
0c2dc87d RG |
2112 | @staticmethod |
2113 | def suitable(url): | |
2114 | return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None) | |
2115 | ||
2116 | def report_download_page(self, playlist_id, pagenum): | |
2117 | """Report attempt to download playlist page with given number.""" | |
331ce0a0 | 2118 | self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) |
0c2dc87d RG |
2119 | |
2120 | def _real_initialize(self): | |
2121 | self._youtube_ie.initialize() | |
d3975459 | 2122 | |
0c2dc87d RG |
2123 | def _real_extract(self, url): |
2124 | # Extract playlist id | |
2125 | mobj = re.match(self._VALID_URL, url) | |
2126 | if mobj is None: | |
147753eb | 2127 | self._downloader.trouble(u'ERROR: invalid url: %s' % url) |
6f21f686 | 2128 | return |
0c2dc87d | 2129 | |
d119b54d RG |
2130 | # Single video case |
2131 | if mobj.group(3) is not None: | |
2132 | self._youtube_ie.extract(mobj.group(3)) | |
2133 | return | |
2134 | ||
0c2dc87d | 2135 | # Download playlist pages |
f74e22ae GI |
2136 | # prefix is 'p' as default for playlists but there are other types that need extra care |
2137 | playlist_prefix = mobj.group(1) | |
2138 | if playlist_prefix == 'a': | |
2139 | playlist_access = 'artist' | |
2140 | else: | |
7cc3c6fd | 2141 | playlist_prefix = 'p' |
f74e22ae GI |
2142 | playlist_access = 'view_play_list' |
2143 | playlist_id = mobj.group(2) | |
0c2dc87d RG |
2144 | video_ids = [] |
2145 | pagenum = 1 | |
2146 | ||
2147 | while True: | |
2148 | self.report_download_page(playlist_id, pagenum) | |
f74e22ae | 2149 | request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)) |
0c2dc87d RG |
2150 | try: |
2151 | page = urllib2.urlopen(request).read() | |
2152 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
147753eb | 2153 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) |
6f21f686 | 2154 | return |
0c2dc87d RG |
2155 | |
2156 | # Extract video identifiers | |
27d98b6e | 2157 | ids_in_page = [] |
0c2dc87d | 2158 | for mobj in re.finditer(self._VIDEO_INDICATOR, page): |
27d98b6e RG |
2159 | if mobj.group(1) not in ids_in_page: |
2160 | ids_in_page.append(mobj.group(1)) | |
2161 | video_ids.extend(ids_in_page) | |
0c2dc87d | 2162 | |
ce5cafea | 2163 | if re.search(self._MORE_PAGES_INDICATOR, page) is None: |
0c2dc87d RG |
2164 | break |
2165 | pagenum = pagenum + 1 | |
2166 | ||
8cc44341 RG |
2167 | playliststart = self._downloader.params.get('playliststart', 1) - 1 |
2168 | playlistend = self._downloader.params.get('playlistend', -1) | |
2169 | video_ids = video_ids[playliststart:playlistend] | |
2170 | ||
0c2dc87d | 2171 | for id in video_ids: |
6f21f686 RG |
2172 | self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) |
2173 | return | |
0c2dc87d | 2174 | |
c39c05cd A |
2175 | class YoutubeUserIE(InfoExtractor): |
2176 | """Information Extractor for YouTube users.""" | |
2177 | ||
5aba6ea4 | 2178 | _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)' |
c39c05cd | 2179 | _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' |
5aba6ea4 RG |
2180 | _GDATA_PAGE_SIZE = 50 |
2181 | _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' | |
2182 | _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' | |
c39c05cd A |
2183 | _youtube_ie = None |
2184 | ||
2185 | def __init__(self, youtube_ie, downloader=None): | |
2186 | InfoExtractor.__init__(self, downloader) | |
2187 | self._youtube_ie = youtube_ie | |
d3975459 | 2188 | |
c39c05cd A |
2189 | @staticmethod |
2190 | def suitable(url): | |
2191 | return (re.match(YoutubeUserIE._VALID_URL, url) is not None) | |
2192 | ||
5aba6ea4 | 2193 | def report_download_page(self, username, start_index): |
c39c05cd | 2194 | """Report attempt to download user page.""" |
5aba6ea4 RG |
2195 | self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % |
2196 | (username, start_index, start_index + self._GDATA_PAGE_SIZE)) | |
c39c05cd A |
2197 | |
2198 | def _real_initialize(self): | |
2199 | self._youtube_ie.initialize() | |
d3975459 | 2200 | |
c39c05cd A |
2201 | def _real_extract(self, url): |
2202 | # Extract username | |
2203 | mobj = re.match(self._VALID_URL, url) | |
2204 | if mobj is None: | |
2205 | self._downloader.trouble(u'ERROR: invalid url: %s' % url) | |
2206 | return | |
2207 | ||
c39c05cd | 2208 | username = mobj.group(1) |
5aba6ea4 RG |
2209 | |
2210 | # Download video ids using YouTube Data API. Result size per | |
2211 | # query is limited (currently to 50 videos) so we need to query | |
2212 | # page by page until there are no video ids - it means we got | |
2213 | # all of them. | |
2214 | ||
c39c05cd | 2215 | video_ids = [] |
5aba6ea4 | 2216 | pagenum = 0 |
c39c05cd | 2217 | |
5aba6ea4 RG |
2218 | while True: |
2219 | start_index = pagenum * self._GDATA_PAGE_SIZE + 1 | |
2220 | self.report_download_page(username, start_index) | |
c39c05cd | 2221 | |
5aba6ea4 | 2222 | request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) |
c39c05cd | 2223 | |
5aba6ea4 RG |
2224 | try: |
2225 | page = urllib2.urlopen(request).read() | |
2226 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
2227 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) | |
2228 | return | |
c39c05cd | 2229 | |
5aba6ea4 RG |
2230 | # Extract video identifiers |
2231 | ids_in_page = [] | |
2232 | ||
2233 | for mobj in re.finditer(self._VIDEO_INDICATOR, page): | |
2234 | if mobj.group(1) not in ids_in_page: | |
2235 | ids_in_page.append(mobj.group(1)) | |
2236 | ||
2237 | video_ids.extend(ids_in_page) | |
2238 | ||
2239 | # A little optimization - if current page is not | |
2240 | # "full", ie. does not contain PAGE_SIZE video ids then | |
2241 | # we can assume that this page is the last one - there | |
2242 | # are no more ids on further pages - no need to query | |
2243 | # again. | |
2244 | ||
2245 | if len(ids_in_page) < self._GDATA_PAGE_SIZE: | |
2246 | break | |
2247 | ||
2248 | pagenum += 1 | |
2249 | ||
2250 | all_ids_count = len(video_ids) | |
8cc44341 RG |
2251 | playliststart = self._downloader.params.get('playliststart', 1) - 1 |
2252 | playlistend = self._downloader.params.get('playlistend', -1) | |
204c9398 | 2253 | |
5aba6ea4 RG |
2254 | if playlistend == -1: |
2255 | video_ids = video_ids[playliststart:] | |
2256 | else: | |
2257 | video_ids = video_ids[playliststart:playlistend] | |
2258 | ||
2259 | self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" % | |
2260 | (username, all_ids_count, len(video_ids))) | |
2261 | ||
2262 | for video_id in video_ids: | |
2263 | self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id) | |
2264 | ||
c39c05cd | 2265 | |
27179cfd VV |
2266 | class DepositFilesIE(InfoExtractor): |
2267 | """Information extractor for depositfiles.com""" | |
2268 | ||
2269 | _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)' | |
2270 | ||
2271 | def __init__(self, downloader=None): | |
2272 | InfoExtractor.__init__(self, downloader) | |
2273 | ||
2274 | @staticmethod | |
2275 | def suitable(url): | |
2276 | return (re.match(DepositFilesIE._VALID_URL, url) is not None) | |
2277 | ||
2278 | def report_download_webpage(self, file_id): | |
2279 | """Report webpage download.""" | |
2280 | self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id) | |
2281 | ||
2282 | def report_extraction(self, file_id): | |
2283 | """Report information extraction.""" | |
2284 | self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id) | |
2285 | ||
2286 | def _real_initialize(self): | |
2287 | return | |
2288 | ||
2289 | def _real_extract(self, url): | |
2290 | # At this point we have a new file | |
2291 | self._downloader.increment_downloads() | |
2292 | ||
2293 | file_id = url.split('/')[-1] | |
2294 | # Rebuild url in english locale | |
2295 | url = 'http://depositfiles.com/en/files/' + file_id | |
2296 | ||
2297 | # Retrieve file webpage with 'Free download' button pressed | |
2298 | free_download_indication = { 'gateway_result' : '1' } | |
1987c232 | 2299 | request = urllib2.Request(url, urllib.urlencode(free_download_indication)) |
27179cfd VV |
2300 | try: |
2301 | self.report_download_webpage(file_id) | |
2302 | webpage = urllib2.urlopen(request).read() | |
2303 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
2304 | self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err)) | |
2305 | return | |
2306 | ||
2307 | # Search for the real file URL | |
2308 | mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage) | |
2309 | if (mobj is None) or (mobj.group(1) is None): | |
2310 | # Try to figure out reason of the error. | |
2311 | mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL) | |
2312 | if (mobj is not None) and (mobj.group(1) is not None): | |
2313 | restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip() | |
2314 | self._downloader.trouble(u'ERROR: %s' % restriction_message) | |
2315 | else: | |
2316 | self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url) | |
2317 | return | |
2318 | ||
2319 | file_url = mobj.group(1) | |
2320 | file_extension = os.path.splitext(file_url)[1][1:] | |
2321 | ||
2322 | # Search for file title | |
2323 | mobj = re.search(r'<b title="(.*?)">', webpage) | |
2324 | if mobj is None: | |
2325 | self._downloader.trouble(u'ERROR: unable to extract title') | |
2326 | return | |
2327 | file_title = mobj.group(1).decode('utf-8') | |
2328 | ||
2329 | try: | |
2330 | # Process file information | |
2331 | self._downloader.process_info({ | |
2332 | 'id': file_id.decode('utf-8'), | |
2333 | 'url': file_url.decode('utf-8'), | |
2334 | 'uploader': u'NA', | |
2335 | 'upload_date': u'NA', | |
2336 | 'title': file_title, | |
2337 | 'stitle': file_title, | |
2338 | 'ext': file_extension.decode('utf-8'), | |
2339 | 'format': u'NA', | |
2340 | 'player_url': None, | |
2341 | }) | |
2342 | except UnavailableVideoError, err: | |
2343 | self._downloader.trouble(u'ERROR: unable to download file') | |
2344 | ||
9f5f9602 GI |
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Handles facebook.com/video/video.php?v=<id> URLs.  Optionally logs
    in first (via --username/--password or a 'facebook' .netrc entry),
    then scrapes the video page for metadata and per-quality stream
    URLs and queues the selected format(s) for download.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Known format names, best quality first; both map to .mp4 files.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        # True when this extractor can handle the given URL.
        return (re.match(FacebookIE._VALID_URL, url) is not None)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page

        Returns a dict carrying whichever of 'title', 'description',
        'owner', 'upload_date', 'thumbnail' could be scraped, plus
        'video_urls': a dict mapping format name -> direct URL (possibly
        empty when no stream URLs were found).
        """
        # General data: one regex per metadata field embedded in the page.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values arrive JS-unicode-escaped and URL-quoted.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from --username/--password or, with --netrc,
        from the 'facebook' machine entry.  Failures are reported as
        warnings and extraction proceeds without authentication.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            # No credentials: continue unauthenticated.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Fetch the video page, scrape metadata and queue download(s)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse runs of non-"simple" chars to '_'
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image (optional; empty string when missing)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date (RFC 2822 per parsedate_tz; falls back to 'NA')
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Keep the 'NA' placeholder if the date can't be formatted.
                    pass

        # description (only scraped when --get-description was requested)
        video_description = 'No description available.'
        if (self._downloader.params.get('forcedescription', False) and
            'description' in video_info):
            video_description = video_info['description']

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never bound
        # and the loop below raises NameError — confirm whether an empty
        # url_map can actually reach this point.
        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Find the video URL in fmt_url_map or conn paramters
            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
65cd34c5 RG |
class PostProcessor(object):
    """Base class for post-download processing steps.

    Instances register with a downloader via its add_post_processor()
    method.  After every successful download the downloader walks its
    chain of PostProcessors, calling run() on each: the first receives
    the initial information dictionary, each subsequent one receives
    whatever the previous run() returned.

    Returning None from run() stops the chain; otherwise processing
    continues until the chain is exhausted.

    Like InfoExtractor, PostProcessor takes part in a "mutual
    registration" scheme with its downloader.
    """

    _downloader = None  # downloader this post processor is attached to

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this post processor to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        The information argument is a dictionary like the ones composed
        by InfoExtractors, extended with a "filepath" key pointing at
        the downloaded file.

        Return None to stop the postprocessing chain, or an (optionally
        modified) information dictionary to pass to the next processor
        in the chain.  May raise PostProcessingError, which is handled
        by the downloader that invoked this processor.

        The default implementation passes the information through
        unchanged.
        """
        return information
3072fab1 RG |
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor extracting a standalone audio file from a
    downloaded video using the external ffprobe/ffmpeg tools.

    preferredcodec may be 'best' (default), 'mp3' or 'aac':
    - 'best' (or a match with the source codec) stream-copies aac/mp3
      audio losslessly and transcodes anything else to mp3;
    - otherwise the audio is transcoded (lossily) to the requested
      codec at 128k.
    """

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        # Default to 'best': keep the original codec when possible.
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        # Returns the codec name of the file's audio stream per ffprobe,
        # or None when ffprobe fails, is missing, or no audio stream exists.
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # 'file' is the Python 2 builtin; ffprobe's stderr is discarded.
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            # ffprobe not installed or not executable.
            return None
        audio_codec = None
        # ffprobe emits key=value lines per stream: remember the latest
        # codec_name and return it once the stream proves to be audio.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        # Runs ffmpeg to extract the audio of 'path' into 'out_path'
        # with the given codec and extra options; True on success.
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Extract audio from information['filepath'].

        On success the original video file is removed, 'filepath' is
        updated to the new audio file and the chain continues.  On any
        failure a warning is printed and None is returned, stopping the
        postprocessing chain.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible: stream-copy the existing audio.
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    # Raw AAC needs an ADTS container to be playable.
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        try:
            # Remove the original video now that the audio was extracted.
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
65cd34c5 | 2696 | ### MAIN PROGRAM ### |
4fa74b52 RG |
2697 | if __name__ == '__main__': |
2698 | try: | |
f9f1e798 | 2699 | # Modules needed only when running the main program |
209e9e27 | 2700 | import getpass |
f9f1e798 RG |
2701 | import optparse |
2702 | ||
0fe64c04 | 2703 | # Function to update the program file with the latest version from the repository. |
4bec29ef RG |
2704 | def update_self(downloader, filename): |
2705 | # Note: downloader only used for options | |
0fe64c04 | 2706 | if not os.access(filename, os.W_OK): |
4bec29ef RG |
2707 | sys.exit('ERROR: no write permissions on %s' % filename) |
2708 | ||
331ce0a0 | 2709 | downloader.to_screen('Updating to latest stable version...') |
0fe64c04 RG |
2710 | try: |
2711 | latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION' | |
2712 | latest_version = urllib.urlopen(latest_url).read().strip() | |
2713 | prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version | |
2714 | newcontent = urllib.urlopen(prog_url).read() | |
2715 | except (IOError, OSError), err: | |
2716 | sys.exit('ERROR: unable to download latest version') | |
2717 | try: | |
2718 | stream = open(filename, 'w') | |
2719 | stream.write(newcontent) | |
2720 | stream.close() | |
2721 | except (IOError, OSError), err: | |
2722 | sys.exit('ERROR: unable to overwrite current version') | |
331ce0a0 | 2723 | downloader.to_screen('Updated to version %s' % latest_version) |
4bec29ef | 2724 | |
f9f1e798 | 2725 | # Parse command line |
209e9e27 | 2726 | parser = optparse.OptionParser( |
7b7759f5 | 2727 | usage='Usage: %prog [options] url...', |
33d507f1 | 2728 | version='2011.08.04', |
7b7759f5 | 2729 | conflict_handler='resolve', |
2730 | ) | |
2731 | ||
209e9e27 RG |
2732 | parser.add_option('-h', '--help', |
2733 | action='help', help='print this help text and exit') | |
2734 | parser.add_option('-v', '--version', | |
2735 | action='version', help='print program version and exit') | |
4bec29ef RG |
2736 | parser.add_option('-U', '--update', |
2737 | action='store_true', dest='update_self', help='update this program to latest stable version') | |
7b7759f5 | 2738 | parser.add_option('-i', '--ignore-errors', |
2739 | action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) | |
2740 | parser.add_option('-r', '--rate-limit', | |
2b06c33d | 2741 | dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)') |
7031008c | 2742 | parser.add_option('-R', '--retries', |
2b06c33d | 2743 | dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10) |
204c9398 RG |
2744 | parser.add_option('--playlist-start', |
2745 | dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1) | |
8cc44341 RG |
2746 | parser.add_option('--playlist-end', |
2747 | dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) | |
e7cf18cb | 2748 | parser.add_option('--dump-user-agent', |
6025795d RG |
2749 | action='store_true', dest='dump_user_agent', |
2750 | help='display the current browser identification', default=False) | |
7b7759f5 | 2751 | |
2752 | authentication = optparse.OptionGroup(parser, 'Authentication Options') | |
2753 | authentication.add_option('-u', '--username', | |
2b06c33d | 2754 | dest='username', metavar='USERNAME', help='account username') |
7b7759f5 | 2755 | authentication.add_option('-p', '--password', |
2b06c33d | 2756 | dest='password', metavar='PASSWORD', help='account password') |
7b7759f5 | 2757 | authentication.add_option('-n', '--netrc', |
209e9e27 | 2758 | action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False) |
7b7759f5 | 2759 | parser.add_option_group(authentication) |
2760 | ||
2761 | video_format = optparse.OptionGroup(parser, 'Video Format Options') | |
2762 | video_format.add_option('-f', '--format', | |
2b06c33d | 2763 | action='store', dest='format', metavar='FORMAT', help='video format code') |
6ba562b0 RG |
2764 | video_format.add_option('--all-formats', |
2765 | action='store_const', dest='format', help='download all available video formats', const='-1') | |
f2413e67 | 2766 | video_format.add_option('--max-quality', |
460d8acb | 2767 | action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') |
7b7759f5 | 2768 | parser.add_option_group(video_format) |
2769 | ||
2770 | verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') | |
2771 | verbosity.add_option('-q', '--quiet', | |
2772 | action='store_true', dest='quiet', help='activates quiet mode', default=False) | |
2773 | verbosity.add_option('-s', '--simulate', | |
2774 | action='store_true', dest='simulate', help='do not download video', default=False) | |
2775 | verbosity.add_option('-g', '--get-url', | |
2776 | action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False) | |
2777 | verbosity.add_option('-e', '--get-title', | |
2778 | action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False) | |
7e58d568 | 2779 | verbosity.add_option('--get-thumbnail', |
6025795d RG |
2780 | action='store_true', dest='getthumbnail', |
2781 | help='simulate, quiet but print thumbnail URL', default=False) | |
7e58d568 | 2782 | verbosity.add_option('--get-description', |
6025795d RG |
2783 | action='store_true', dest='getdescription', |
2784 | help='simulate, quiet but print video description', default=False) | |
9f796346 | 2785 | verbosity.add_option('--get-filename', |
6025795d RG |
2786 | action='store_true', dest='getfilename', |
2787 | help='simulate, quiet but print output filename', default=False) | |
d9835247 RG |
2788 | verbosity.add_option('--no-progress', |
2789 | action='store_true', dest='noprogress', help='do not print progress bar', default=False) | |
ccbd296b | 2790 | verbosity.add_option('--console-title', |
6025795d RG |
2791 | action='store_true', dest='consoletitle', |
2792 | help='display progress in console titlebar', default=False) | |
7b7759f5 | 2793 | parser.add_option_group(verbosity) |
2794 | ||
2795 | filesystem = optparse.OptionGroup(parser, 'Filesystem Options') | |
1c76e23e RG |
2796 | filesystem.add_option('-t', '--title', |
2797 | action='store_true', dest='usetitle', help='use title in file name', default=False) | |
2798 | filesystem.add_option('-l', '--literal', | |
2799 | action='store_true', dest='useliteral', help='use literal title in file name', default=False) | |
1e47d226 | 2800 | filesystem.add_option('-A', '--auto-number', |
6025795d RG |
2801 | action='store_true', dest='autonumber', |
2802 | help='number downloaded files starting from 00000', default=False) | |
7b7759f5 | 2803 | filesystem.add_option('-o', '--output', |
2b06c33d | 2804 | dest='outtmpl', metavar='TEMPLATE', help='output filename template') |
7b7759f5 | 2805 | filesystem.add_option('-a', '--batch-file', |
2b06c33d | 2806 | dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') |
7b7759f5 | 2807 | filesystem.add_option('-w', '--no-overwrites', |
0beeff4b | 2808 | action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) |
f76c2df6 PI |
2809 | filesystem.add_option('-c', '--continue', |
2810 | action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False) | |
80066952 RG |
2811 | filesystem.add_option('--cookies', |
2812 | dest='cookiefile', metavar='FILE', help='file to dump cookie jar to') | |
3fb2c487 RG |
2813 | filesystem.add_option('--no-part', |
2814 | action='store_true', dest='nopart', help='do not use .part files', default=False) | |
e3018902 RG |
2815 | filesystem.add_option('--no-mtime', |
2816 | action='store_false', dest='updatetime', | |
2817 | help='do not use the Last-modified header to set the file modification time', default=True) | |
7b7759f5 | 2818 | parser.add_option_group(filesystem) |
2819 | ||
3072fab1 RG |
2820 | postproc = optparse.OptionGroup(parser, 'Post-processing Options') |
2821 | postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False, | |
2822 | help='convert video files to audio-only files (requires ffmpeg and ffprobe)') | |
2823 | postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best', | |
2824 | help='"best", "aac" or "mp3"; best by default') | |
2825 | parser.add_option_group(postproc) | |
2826 | ||
209e9e27 | 2827 | (opts, args) = parser.parse_args() |
2a7353b8 | 2828 | |
80066952 RG |
2829 | # Open appropriate CookieJar |
2830 | if opts.cookiefile is None: | |
2831 | jar = cookielib.CookieJar() | |
2832 | else: | |
2833 | try: | |
2834 | jar = cookielib.MozillaCookieJar(opts.cookiefile) | |
e0c982c8 RG |
2835 | if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK): |
2836 | jar.load() | |
80066952 RG |
2837 | except (IOError, OSError), err: |
2838 | sys.exit(u'ERROR: unable to open cookie file') | |
2839 | ||
e7cf18cb RG |
2840 | # Dump user agent |
2841 | if opts.dump_user_agent: | |
2842 | print std_headers['User-Agent'] | |
2843 | sys.exit(0) | |
2844 | ||
80066952 RG |
2845 | # General configuration |
2846 | cookie_processor = urllib2.HTTPCookieProcessor(jar) | |
1987c232 | 2847 | urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())) |
80066952 RG |
2848 | socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) |
2849 | ||
c6fd0bb8 | 2850 | # Batch file verification |
d1580ed9 | 2851 | batchurls = [] |
c6fd0bb8 RG |
2852 | if opts.batchfile is not None: |
2853 | try: | |
2a7353b8 RG |
2854 | if opts.batchfile == '-': |
2855 | batchfd = sys.stdin | |
2856 | else: | |
2857 | batchfd = open(opts.batchfile, 'r') | |
2858 | batchurls = batchfd.readlines() | |
b65740e4 | 2859 | batchurls = [x.strip() for x in batchurls] |
817e8f52 | 2860 | batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] |
c6fd0bb8 RG |
2861 | except IOError: |
2862 | sys.exit(u'ERROR: batch file could not be read') | |
2863 | all_urls = batchurls + args | |
2864 | ||
209e9e27 | 2865 | # Conflicting, missing and erroneous options |
209e9e27 | 2866 | if opts.usenetrc and (opts.username is not None or opts.password is not None): |
2740c509 | 2867 | parser.error(u'using .netrc conflicts with giving username/password') |
209e9e27 | 2868 | if opts.password is not None and opts.username is None: |
2740c509 | 2869 | parser.error(u'account username missing') |
1e47d226 NA |
2870 | if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber): |
2871 | parser.error(u'using output template conflicts with using title, literal title or auto number') | |
209e9e27 | 2872 | if opts.usetitle and opts.useliteral: |
2740c509 | 2873 | parser.error(u'using title conflicts with using literal title') |
209e9e27 | 2874 | if opts.username is not None and opts.password is None: |
76a7f364 | 2875 | opts.password = getpass.getpass(u'Type account password and press return:') |
acd3d842 RG |
2876 | if opts.ratelimit is not None: |
2877 | numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) | |
2878 | if numeric_limit is None: | |
2740c509 | 2879 | parser.error(u'invalid rate limit specified') |
acd3d842 | 2880 | opts.ratelimit = numeric_limit |
7031008c RG |
2881 | if opts.retries is not None: |
2882 | try: | |
2883 | opts.retries = long(opts.retries) | |
2884 | except (TypeError, ValueError), err: | |
2885 | parser.error(u'invalid retry count specified') | |
8cc44341 RG |
2886 | try: |
2887 | opts.playliststart = long(opts.playliststart) | |
2888 | if opts.playliststart <= 0: | |
2889 | raise ValueError | |
2890 | except (TypeError, ValueError), err: | |
2891 | parser.error(u'invalid playlist start number specified') | |
2892 | try: | |
2893 | opts.playlistend = long(opts.playlistend) | |
2894 | if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): | |
2895 | raise ValueError | |
2896 | except (TypeError, ValueError), err: | |
2897 | parser.error(u'invalid playlist end number specified') | |
3072fab1 RG |
2898 | if opts.extractaudio: |
2899 | if opts.audioformat not in ['best', 'aac', 'mp3']: | |
2900 | parser.error(u'invalid audio format specified') | |
4fa74b52 RG |
2901 | |
2902 | # Information extractors | |
2903 | youtube_ie = YoutubeIE() | |
020f7150 | 2904 | metacafe_ie = MetacafeIE(youtube_ie) |
4135fa45 | 2905 | dailymotion_ie = DailymotionIE() |
0c2dc87d | 2906 | youtube_pl_ie = YoutubePlaylistIE(youtube_ie) |
c39c05cd | 2907 | youtube_user_ie = YoutubeUserIE(youtube_ie) |
25af2bce | 2908 | youtube_search_ie = YoutubeSearchIE(youtube_ie) |
49c0028a | 2909 | google_ie = GoogleIE() |
7e58d568 | 2910 | google_search_ie = GoogleSearchIE(google_ie) |
49c0028a | 2911 | photobucket_ie = PhotobucketIE() |
61945318 | 2912 | yahoo_ie = YahooIE() |
7e58d568 | 2913 | yahoo_search_ie = YahooSearchIE(yahoo_ie) |
27179cfd | 2914 | deposit_files_ie = DepositFilesIE() |
9f5f9602 | 2915 | facebook_ie = FacebookIE() |
490fd7ae | 2916 | generic_ie = GenericIE() |
4fa74b52 RG |
2917 | |
2918 | # File downloader | |
9fcd8355 | 2919 | fd = FileDownloader({ |
209e9e27 RG |
2920 | 'usenetrc': opts.usenetrc, |
2921 | 'username': opts.username, | |
2922 | 'password': opts.password, | |
9f796346 | 2923 | 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename), |
209e9e27 RG |
2924 | 'forceurl': opts.geturl, |
2925 | 'forcetitle': opts.gettitle, | |
7e58d568 RG |
2926 | 'forcethumbnail': opts.getthumbnail, |
2927 | 'forcedescription': opts.getdescription, | |
9f796346 GI |
2928 | 'forcefilename': opts.getfilename, |
2929 | 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename), | |
320becd6 | 2930 | 'format': opts.format, |
f2413e67 | 2931 | 'format_limit': opts.format_limit, |
eae2666c | 2932 | 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding())) |
6ba562b0 RG |
2933 | or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s') |
2934 | or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s') | |
2935 | or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s') | |
1e47d226 NA |
2936 | or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s') |
2937 | or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s') | |
76a7f364 RG |
2938 | or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s') |
2939 | or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s') | |
1e47d226 | 2940 | or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') |
76a7f364 | 2941 | or u'%(id)s.%(ext)s'), |
0086d1ec | 2942 | 'ignoreerrors': opts.ignoreerrors, |
acd3d842 | 2943 | 'ratelimit': opts.ratelimit, |
0beeff4b | 2944 | 'nooverwrites': opts.nooverwrites, |
7031008c | 2945 | 'retries': opts.retries, |
7db85b2c | 2946 | 'continuedl': opts.continue_dl, |
d9835247 | 2947 | 'noprogress': opts.noprogress, |
204c9398 | 2948 | 'playliststart': opts.playliststart, |
8cc44341 | 2949 | 'playlistend': opts.playlistend, |
331ce0a0 | 2950 | 'logtostderr': opts.outtmpl == '-', |
ccbd296b | 2951 | 'consoletitle': opts.consoletitle, |
3fb2c487 | 2952 | 'nopart': opts.nopart, |
e3018902 | 2953 | 'updatetime': opts.updatetime, |
9fcd8355 | 2954 | }) |
25af2bce | 2955 | fd.add_info_extractor(youtube_search_ie) |
0c2dc87d | 2956 | fd.add_info_extractor(youtube_pl_ie) |
c39c05cd | 2957 | fd.add_info_extractor(youtube_user_ie) |
020f7150 | 2958 | fd.add_info_extractor(metacafe_ie) |
4135fa45 | 2959 | fd.add_info_extractor(dailymotion_ie) |
4fa74b52 | 2960 | fd.add_info_extractor(youtube_ie) |
49c0028a | 2961 | fd.add_info_extractor(google_ie) |
7e58d568 | 2962 | fd.add_info_extractor(google_search_ie) |
49c0028a | 2963 | fd.add_info_extractor(photobucket_ie) |
61945318 | 2964 | fd.add_info_extractor(yahoo_ie) |
7e58d568 | 2965 | fd.add_info_extractor(yahoo_search_ie) |
27179cfd | 2966 | fd.add_info_extractor(deposit_files_ie) |
9f5f9602 | 2967 | fd.add_info_extractor(facebook_ie) |
4bec29ef | 2968 | |
490fd7ae RG |
2969 | # This must come last since it's the |
2970 | # fallback if none of the others work | |
2971 | fd.add_info_extractor(generic_ie) | |
2972 | ||
3072fab1 RG |
2973 | # PostProcessors |
2974 | if opts.extractaudio: | |
2975 | fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat)) | |
2976 | ||
4bec29ef RG |
2977 | # Update version |
2978 | if opts.update_self: | |
2979 | update_self(fd, sys.argv[0]) | |
2980 | ||
2981 | # Maybe do nothing | |
2982 | if len(all_urls) < 1: | |
2983 | if not opts.update_self: | |
2984 | parser.error(u'you must provide at least one URL') | |
2985 | else: | |
2986 | sys.exit() | |
c6fd0bb8 | 2987 | retcode = fd.download(all_urls) |
80066952 RG |
2988 | |
2989 | # Dump cookie jar if requested | |
2990 | if opts.cookiefile is not None: | |
2991 | try: | |
2992 | jar.save() | |
2993 | except (IOError, OSError), err: | |
2994 | sys.exit(u'ERROR: unable to save cookie jar') | |
2995 | ||
bb681b88 | 2996 | sys.exit(retcode) |
4fa74b52 | 2997 | |
e5bf0f55 RG |
2998 | except DownloadError: |
2999 | sys.exit(1) | |
3000 | except SameFileError: | |
76a7f364 | 3001 | sys.exit(u'ERROR: fixed output name but more than one file to download') |
4fa74b52 | 3002 | except KeyboardInterrupt: |
76a7f364 | 3003 | sys.exit(u'\nERROR: Interrupted by user') |
e9cb9c28 GV |
3004 | |
3005 | # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: |