]>
Commit | Line | Data |
---|---|---|
4fa74b52 RG |
1 | #!/usr/bin/env python |
2 | # -*- coding: utf-8 -*- | |
3 | # Author: Ricardo Garcia Gonzalez | |
4 | # License: Public domain code | |
5 | import htmlentitydefs | |
6 | import httplib | |
7 | import math | |
8 | import netrc | |
9 | import os | |
10 | import os.path | |
11 | import re | |
12 | import socket | |
13 | import string | |
14 | import sys | |
15 | import time | |
16 | import urllib | |
17 | import urllib2 | |
18 | ||
19 | std_headers = { | |
7414bdf1 | 20 | 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1', |
4fa74b52 RG |
21 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', |
22 | 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', | |
23 | 'Accept-Language': 'en-us,en;q=0.5', | |
24 | } | |
25 | ||
26 | simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii') | |
27 | ||
28 | class FileDownloader(object): | |
29 | """File Downloader class. | |
30 | ||
31 | File downloader objects are the ones responsible of downloading the | |
32 | actual video file and writing it to disk if the user has requested | |
33 | it, among some other tasks. In most cases there should be one per | |
34 | program. As, given a video URL, the downloader doesn't know how to | |
35 | extract all the needed information, task that InfoExtractors do, it | |
36 | has to pass the URL to one of them. | |
37 | ||
38 | For this, file downloader objects have a method that allows | |
39 | InfoExtractors to be registered in a given order. When it is passed | |
40 | a URL, the file downloader handles it to the first InfoExtractor it | |
b4634726 | 41 | finds that reports being able to handle it. The InfoExtractor returns |
4fa74b52 RG |
42 | all the information to the FileDownloader and the latter downloads the |
43 | file or does whatever it's instructed to do. | |
44 | ||
45 | File downloaders accept a lot of parameters. In order not to saturate | |
46 | the object constructor with arguments, it receives a dictionary of | |
47 | options instead. These options are available through the get_params() | |
48 | method for the InfoExtractors to use. The FileDownloader also registers | |
49 | itself as the downloader in charge for the InfoExtractors that are | |
50 | added to it, so this is a "mutual registration". | |
51 | ||
52 | Available options: | |
53 | ||
54 | username: Username for authentication purposes. | |
55 | password: Password for authentication purposes. | |
56 | usenetrc: Use netrc for authentication instead. | |
57 | quiet: Do not print messages to stdout. | |
05a84b35 RG |
58 | forceurl: Force printing final URL. |
59 | forcetitle: Force printing title. | |
b609fd54 | 60 | simulate: Do not download the video files. |
4fa74b52 RG |
61 | format: Video format code. |
62 | outtmpl: Template for output names. | |
63 | """ | |
64 | ||
65 | _params = None | |
66 | _ies = [] | |
67 | ||
68 | def __init__(self, params): | |
69 | self._ies = [] | |
70 | self.set_params(params) | |
71 | ||
72 | @staticmethod | |
73 | def pmkdir(filename): | |
74 | """Create directory components in filename. Similar to Unix "mkdir -p".""" | |
75 | components = filename.split(os.sep) | |
76 | aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))] | |
77 | for dir in aggregate: | |
78 | if not os.path.exists(dir): | |
79 | os.mkdir(dir) | |
80 | ||
81 | @staticmethod | |
82 | def format_bytes(bytes): | |
83 | if bytes is None: | |
84 | return 'N/A' | |
85 | if bytes == 0: | |
86 | exponent = 0 | |
87 | else: | |
88 | exponent = long(math.log(float(bytes), 1024.0)) | |
89 | suffix = 'bkMGTPEZY'[exponent] | |
4fa74b52 RG |
90 | converted = float(bytes) / float(1024**exponent) |
91 | return '%.2f%s' % (converted, suffix) | |
92 | ||
93 | @staticmethod | |
94 | def calc_percent(byte_counter, data_len): | |
95 | if data_len is None: | |
96 | return '---.-%' | |
97 | return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) | |
98 | ||
99 | @staticmethod | |
100 | def calc_eta(start, now, total, current): | |
101 | if total is None: | |
102 | return '--:--' | |
103 | dif = now - start | |
104 | if current == 0 or dif < 0.001: # One millisecond | |
105 | return '--:--' | |
106 | rate = float(current) / dif | |
107 | eta = long((float(total) - float(current)) / rate) | |
108 | (eta_mins, eta_secs) = divmod(eta, 60) | |
109 | if eta_mins > 99: | |
110 | return '--:--' | |
111 | return '%02d:%02d' % (eta_mins, eta_secs) | |
112 | ||
113 | @staticmethod | |
114 | def calc_speed(start, now, bytes): | |
115 | dif = now - start | |
116 | if bytes == 0 or dif < 0.001: # One millisecond | |
9fcd8355 | 117 | return '%10s' % '---b/s' |
4fa74b52 RG |
118 | return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) |
119 | ||
120 | @staticmethod | |
121 | def best_block_size(elapsed_time, bytes): | |
122 | new_min = max(bytes / 2.0, 1.0) | |
123 | new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB | |
124 | if elapsed_time < 0.001: | |
125 | return int(new_max) | |
126 | rate = bytes / elapsed_time | |
127 | if rate > new_max: | |
128 | return int(new_max) | |
129 | if rate < new_min: | |
130 | return int(new_min) | |
131 | return int(rate) | |
132 | ||
133 | def set_params(self, params): | |
134 | """Sets parameters.""" | |
135 | if type(params) != dict: | |
136 | raise ValueError('params: dictionary expected') | |
137 | self._params = params | |
138 | ||
139 | def get_params(self): | |
140 | """Get parameters.""" | |
141 | return self._params | |
142 | ||
143 | def add_info_extractor(self, ie): | |
144 | """Add an InfoExtractor object to the end of the list.""" | |
145 | self._ies.append(ie) | |
146 | ie.set_downloader(self) | |
147 | ||
9fcd8355 RG |
148 | def to_stdout(self, message, skip_eol=False): |
149 | """Print message to stdout if not in quiet mode.""" | |
150 | if not self._params.get('quiet', False): | |
151 | sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol])) | |
152 | sys.stdout.flush() | |
7e5cab67 RG |
153 | |
154 | def to_stderr(self, message): | |
155 | """Print message to stderr.""" | |
156 | sys.stderr.write('%s\n' % message) | |
22899cea RG |
157 | |
158 | def fixed_template(self): | |
159 | """Checks if the output template is fixed.""" | |
f97c8db7 | 160 | return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None) |
9fcd8355 | 161 | |
4fa74b52 RG |
162 | def download(self, url_list): |
163 | """Download a given list of URLs.""" | |
22899cea RG |
164 | if len(url_list) > 1 and self.fixed_template(): |
165 | sys.exit('ERROR: fixed output name but more than one file to download') | |
166 | ||
4fa74b52 RG |
167 | for url in url_list: |
168 | suitable_found = False | |
169 | for ie in self._ies: | |
170 | if not ie.suitable(url): | |
171 | continue | |
172 | # Suitable InfoExtractor found | |
173 | suitable_found = True | |
b4634726 RG |
174 | results = [x for x in ie.extract(url) if x is not None] |
175 | ||
22899cea | 176 | if len(results) > 1 and self.fixed_template(): |
b4634726 RG |
177 | sys.exit('ERROR: fixed output name but more than one file to download') |
178 | ||
179 | for result in results: | |
05a84b35 RG |
180 | |
181 | # Forced printings | |
182 | if self._params.get('forcetitle', False): | |
183 | print result['title'] | |
184 | if self._params.get('forceurl', False): | |
185 | print result['url'] | |
186 | ||
187 | # Do nothing else if in simulate mode | |
188 | if self._params.get('simulate', False): | |
189 | continue | |
190 | ||
4fa74b52 RG |
191 | try: |
192 | filename = self._params['outtmpl'] % result | |
14c30068 | 193 | except (ValueError, KeyError), err: |
7e5cab67 | 194 | self.to_stderr('ERROR: invalid output template: %s' % str(err)) |
4fa74b52 RG |
195 | continue |
196 | try: | |
197 | self.pmkdir(filename) | |
198 | except (OSError, IOError), err: | |
7e5cab67 | 199 | self.to_stderr('ERROR: unable to create directories: %s' % str(err)) |
4fa74b52 RG |
200 | continue |
201 | try: | |
202 | outstream = open(filename, 'wb') | |
203 | except (OSError, IOError), err: | |
7e5cab67 | 204 | self.to_stderr('ERROR: unable to open for writing: %s' % str(err)) |
4fa74b52 RG |
205 | continue |
206 | try: | |
207 | self._do_download(outstream, result['url']) | |
208 | outstream.close() | |
209 | except (OSError, IOError), err: | |
7e5cab67 | 210 | self.to_stderr('ERROR: unable to write video data: %s' % str(err)) |
4fa74b52 RG |
211 | continue |
212 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
7e5cab67 | 213 | self.to_stderr('ERROR: unable to download video data: %s' % str(err)) |
4fa74b52 RG |
214 | continue |
215 | break | |
216 | if not suitable_found: | |
7e5cab67 | 217 | self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url) |
4fa74b52 RG |
218 | |
219 | def _do_download(self, stream, url): | |
220 | request = urllib2.Request(url, None, std_headers) | |
221 | data = urllib2.urlopen(request) | |
222 | data_len = data.info().get('Content-length', None) | |
223 | data_len_str = self.format_bytes(data_len) | |
224 | byte_counter = 0 | |
225 | block_size = 1024 | |
226 | start = time.time() | |
227 | while True: | |
228 | percent_str = self.calc_percent(byte_counter, data_len) | |
229 | eta_str = self.calc_eta(start, time.time(), data_len, byte_counter) | |
230 | speed_str = self.calc_speed(start, time.time(), byte_counter) | |
9fcd8355 RG |
231 | self.to_stdout('\r[download] %s of %s at %s ETA %s' % |
232 | (percent_str, data_len_str, speed_str, eta_str), skip_eol=True) | |
4fa74b52 RG |
233 | |
234 | before = time.time() | |
235 | data_block = data.read(block_size) | |
236 | after = time.time() | |
237 | data_block_len = len(data_block) | |
238 | if data_block_len == 0: | |
239 | break | |
240 | byte_counter += data_block_len | |
241 | stream.write(data_block) | |
242 | block_size = self.best_block_size(after - before, data_block_len) | |
243 | ||
9fcd8355 | 244 | self.to_stdout('') |
4fa74b52 RG |
245 | if data_len is not None and str(byte_counter) != data_len: |
246 | raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len)) | |
247 | ||
248 | class InfoExtractor(object): | |
249 | """Information Extractor class. | |
250 | ||
251 | Information extractors are the classes that, given a URL, extract | |
252 | information from the video (or videos) the URL refers to. This | |
253 | information includes the real video URL, the video title and simplified | |
254 | title, author and others. It is returned in a list of dictionaries when | |
255 | calling its extract() method. It is a list because a URL can refer to | |
256 | more than one video (think of playlists). The dictionaries must include | |
257 | the following fields: | |
258 | ||
259 | id: Video identifier. | |
260 | url: Final video URL. | |
261 | uploader: Nickname of the video uploader. | |
262 | title: Literal title. | |
263 | stitle: Simplified title. | |
264 | ext: Video filename extension. | |
265 | ||
266 | Subclasses of this one should re-define the _real_initialize() and | |
267 | _real_extract() methods, as well as the suitable() static method. | |
268 | Probably, they should also be instantiated and added to the main | |
269 | downloader. | |
270 | """ | |
271 | ||
272 | _ready = False | |
273 | _downloader = None | |
274 | ||
275 | def __init__(self, downloader=None): | |
276 | """Constructor. Receives an optional downloader.""" | |
277 | self._ready = False | |
278 | self.set_downloader(downloader) | |
279 | ||
280 | @staticmethod | |
281 | def suitable(url): | |
282 | """Receives a URL and returns True if suitable for this IE.""" | |
283 | return True | |
284 | ||
285 | def initialize(self): | |
286 | """Initializes an instance (login, etc).""" | |
287 | if not self._ready: | |
288 | self._real_initialize() | |
289 | self._ready = True | |
290 | ||
291 | def extract(self, url): | |
292 | """Extracts URL information and returns it in list of dicts.""" | |
293 | self.initialize() | |
294 | return self._real_extract(url) | |
295 | ||
296 | def set_downloader(self, downloader): | |
297 | """Sets the downloader for this IE.""" | |
298 | self._downloader = downloader | |
299 | ||
300 | def to_stdout(self, message): | |
301 | if self._downloader is None or not self._downloader.get_params().get('quiet', False): | |
302 | print message | |
303 | ||
304 | def to_stderr(self, message): | |
305 | sys.stderr.write('%s\n' % message) | |
306 | ||
307 | def _real_initialize(self): | |
308 | """Real initialization process. Redefine in subclasses.""" | |
309 | pass | |
310 | ||
311 | def _real_extract(self, url): | |
312 | """Real extraction process. Redefine in subclasses.""" | |
313 | pass | |
314 | ||
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Login / age-verification form targets and the .netrc machine name
    # used to look up stored credentials.
    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    def _real_initialize(self):
        """Log in to YouTube when credentials are available, then confirm age.

        Credentials come from the downloader params ('username'/'password')
        or, when 'usenetrc' is set, from the "youtube" machine entry in
        ~/.netrc. Login failures are warnings (anonymous use still works);
        a failed age confirmation aborts the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # authenticators() returns (login, account, password).
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials from either source: proceed anonymously.
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':         '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Scrape the watch page for *url* and return a one-element list
        with the video's information dictionary, or [None] when any
        extraction step fails (the caller filters None entries out).
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        # fmt 18 is served as MP4; everything else as FLV.
        video_extension = {'18': 'mp4'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        # Session token embedded in the watch page; required by get_video.
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Replace named HTML entities (&amp; etc.) with their characters.
        video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)

        # simplified title
        # Collapse every run of disallowed characters into a single '_'
        # and trim leading/trailing underscores.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Return information
        return [{
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension,
            }]
4fa74b52 RG |
447 | |
if __name__ == '__main__':
    try:
        # General configuration
        # Build ONE opener carrying every handler: each call to
        # urllib2.install_opener() replaces the previously installed
        # opener, so installing the proxy handler and the cookie
        # processor in two separate calls silently discarded the
        # proxy handler.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': True,
            'forceurl': True,
            'forcetitle': True,
            'simulate': True,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s',
            })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
            ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')