]>
Commit | Line | Data |
---|---|---|
4fa74b52 RG |
1 | #!/usr/bin/env python |
2 | # -*- coding: utf-8 -*- | |
3 | # Author: Ricardo Garcia Gonzalez | |
4 | # License: Public domain code | |
5 | import htmlentitydefs | |
6 | import httplib | |
7 | import math | |
8 | import netrc | |
9 | import os | |
10 | import os.path | |
11 | import re | |
12 | import socket | |
13 | import string | |
14 | import sys | |
15 | import time | |
16 | import urllib | |
17 | import urllib2 | |
18 | ||
# Default HTTP headers attached to every urllib2 request made by this
# script. The User-Agent value must not repeat the header name: the
# dictionary key already supplies it.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}
25 | ||
26 | simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii') | |
27 | ||
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:   Username for authentication purposes.
    password:   Password for authentication purposes.
    usenetrc:   Use netrc for authentication instead.
    quiet:      Do not print messages to stdout.
    simulate:   Do not download the video files.
    format:     Video format code.
    outtmpl:    Template for output names.
    """

    _params = None
    _ies = []

    def __init__(self, params):
        """Store the options dictionary and start with no extractors."""
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        Only the directory part of filename is created; the final
        component is taken to be the file itself. Raises OSError when a
        directory cannot be created.
        """
        directory = os.path.dirname(filename)
        # An empty directory part means the file lives in the current
        # directory; absolute paths are handled by makedirs() itself.
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)

    @staticmethod
    def format_bytes(bytes):
        """Return a human-readable size such as '1.50k', or 'N/A' for None.

        Accepts numbers or numeric strings (a raw Content-Length value).
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        converted = float(bytes)
        exponent = 0
        # Repeated division avoids math.log() domain errors on zero and
        # never indexes past the last known suffix.
        while converted >= 1024.0 and exponent < len(suffixes) - 1:
            converted /= 1024.0
            exponent += 1
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a six-character string."""
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total <= 0.0:
            # A zero-length body has no meaningful percentage.
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS', or '--:--'."""
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed, right-aligned in 10 columns."""
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the last block's size and duration.

        The result stays between half and double the last block size and
        never exceeds 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters. Raises ValueError unless params is a dictionary."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self._params.get('quiet', False):
            if skip_eol:
                terminator = ''
            else:
                terminator = '\n'
            sys.stdout.write('%s%s' % (message, terminator))
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def download(self, url_list):
        """Download a given list of URLs.

        Each URL is handled by the first registered InfoExtractor that
        reports it as suitable. Failures on individual files are reported
        on stderr and the remaining files are still processed.
        """
        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                results = [x for x in (ie.extract(url) or []) if x is not None]

                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                    sys.exit('ERROR: fixed output name but more than one file to download')

                if not self._params.get('simulate', False):
                    for result in results:
                        self._download_result(result)
                # Only the first suitable extractor handles a URL, in
                # simulate mode too.
                break
            if not suitable_found:
                self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)

    def _download_result(self, result):
        """Write one extracted video to disk, reporting failures on stderr."""
        try:
            filename = self._params['outtmpl'] % result
        except KeyError as err:
            self.to_stderr('ERROR: invalid output template: %s' % str(err))
            return
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to create directories: %s' % str(err))
            return
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
            return
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file even when the transfer fails.
                outstream.close()
        # Network errors come first: urllib2.URLError and socket.error
        # derive from IOError and would otherwise be reported as write
        # failures.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to download video data: %s' % str(err))
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to write video data: %s' % str(err))

    def _do_download(self, stream, url):
        """Fetch url into stream, printing a progress line as data arrives.

        Raises ValueError when fewer or more bytes arrive than the
        Content-Length header announced.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            data_len = data.info().get('Content-length', None)
            if data_len is not None:
                # The header arrives as text; an unparsable value is
                # treated as an unknown length.
                try:
                    data_len = int(data_len)
                except ValueError:
                    data_len = None
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                        (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                block_size = self.best_block_size(after - before, data_block_len)

            # Terminate the progress line.
            self.to_stdout('')
        finally:
            data.close()
        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
230 | ||
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor takes a URL and gathers the data describing
    the video (or videos) behind it: the real video URL, the title and
    its simplified form, the author and so on. The extract() method hands
    that data back as a list of dictionaries, one per video, because a
    single URL may stand for several videos (a playlist, for instance).
    Every dictionary must carry these fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.

    Subclasses are expected to override _real_initialize() and
    _real_extract(), plus the suitable() static method. Most likely they
    also need to be instantiated and registered with the main downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Build the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Tell whether this extractor can handle url (always True here)."""
        return True

    def initialize(self):
        """Run the one-time setup (login, etc) unless already done."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the extracted list of dicts."""
        self.initialize()
        info = self._real_extract(url)
        return info

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports to."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message unless the attached downloader is in quiet mode."""
        downloader = self._downloader
        if downloader is not None and downloader.get_params().get('quiet', False):
            return
        print(message)

    def to_stderr(self, message):
        """Write message and a newline to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Actual setup work. Meant to be overridden by subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work. Meant to be overridden by subclasses."""
        pass
297 | ||
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    @staticmethod
    def _decode_entity(mobj):
        """Turn one matched HTML entity into its character.

        Handles named (&amp;), decimal (&#39;) and hexadecimal (&#x27;)
        entities. Anything that cannot be decoded is returned untouched
        instead of raising.
        """
        entity = mobj.group(1)
        try:
            if entity[:2] in (u'#x', u'#X'):
                return unichr(int(entity[2:], 16))
            if entity[:1] == u'#':
                return unichr(int(entity[1:]))
            return unichr(htmlentitydefs.name2codepoint[entity])
        except (KeyError, ValueError, OverflowError):
            return mobj.group(0)

    def _real_initialize(self):
        """Log in and confirm age when credentials are available.

        Credentials come from the downloader's 'username'/'password'
        options or, when 'usenetrc' is set, from the .netrc entry for
        the 'youtube' machine. Login problems only produce a warning;
        a failed age confirmation request terminates the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    # Raised here so that the handler below reports it.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Return a one-element list describing the video behind url.

        The element is a dictionary with the id, url, uploader, title,
        stitle and ext fields, or None when any piece of information
        cannot be extracted. A failure to download the video web page
        terminates the program.
        """
        # Extract video id from URL. Group 1 is the optional site prefix,
        # so a bare video id is accepted too; trailing text is only
        # allowed when the prefix is present.
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension. Compared as text so that both 18 and '18' select mp4.
        video_extension = {'18': 'mp4'}.get(str(format_param), 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Match single entities only, so that a stray "&" followed later
        # by ";" is not mistaken for one.
        video_title = re.sub(u'&(#?\\w+);', self._decode_entity, video_title)

        # simplified title
        simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
4fa74b52 RG |
430 | |
if __name__ == '__main__':
    try:
        # General configuration. install_opener() replaces the global
        # opener, so proxy and cookie support go into a single opener
        # rather than two successive calls.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': False,
            'simulate': True,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s'
            })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
            ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')