]> jfr.im git - yt-dlp.git/blame - youtube-dl
Minor improvements and changes
[yt-dlp.git] / youtube-dl
CommitLineData
4fa74b52
RG
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Author: Ricardo Garcia Gonzalez
4# License: Public domain code
5import htmlentitydefs
6import httplib
7import math
8import netrc
9import os
10import os.path
11import re
12import socket
13import string
14import sys
15import time
16import urllib
17import urllib2
18
19std_headers = {
7414bdf1 20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
4fa74b52
RG
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
24}
25
26simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
27
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, a task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:   Username for authentication purposes.
    password:   Password for authentication purposes.
    usenetrc:   Use netrc for authentication instead.
    quiet:      Do not print messages to stdout.
    forceurl:   Force printing final URL.
    forcetitle: Force printing title.
    simulate:   Do not download the video files.
    format:     Video format code.
    outtmpl:    Template for output names.
    """

    _params = None
    _ies = []

    def __init__(self, params):
        """Create a downloader configured by the params dictionary."""
        # Rebind per instance so the class-level list is never shared.
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create the directory that will contain filename.

        Similar to Unix "mkdir -p" applied to the parent directory of
        filename. Nothing is done when filename has no directory part or
        the directory already exists. Raises OSError when the directory
        cannot be created.
        """
        directory = os.path.dirname(filename)
        # An empty directory part means "current directory"; asking the OS
        # to create '' (as happens when an absolute path is split on the
        # separator) would fail.
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)

    @staticmethod
    def format_bytes(bytes):
        """Format a byte count as a short string such as '1.50k'.

        bytes may be a number, a numeric string (for example the value of
        a Content-Length header) or None, for which 'N/A' is returned.
        Raises ValueError if a string argument is not numeric.
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        amount = float(bytes)
        if amount < 1.0:
            # Covers zero and fractional rates, whose logarithm is
            # undefined or negative.
            exponent = 0
        else:
            # Clamp so huge values reuse the largest suffix instead of
            # indexing past the end of the table.
            exponent = min(int(math.log(amount, 1024.0)), len(suffixes) - 1)
        converted = amount / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string.

        '---.-%' is returned when the total length is unknown (None) or
        zero.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0.0:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        '--:--' is returned when the total is unknown, nothing has been
        transferred yet, less than a millisecond has elapsed, or the
        estimate exceeds 99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the size to use for the next read.

        The result is the observed rate (bytes / elapsed_time) limited to
        the range [bytes / 2, bytes * 2], and never above 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters. Raises ValueError if params is not a dict."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if self._params.get('quiet', False):
            return
        terminator = '\n'
        if skip_eol:
            terminator = ''
        sys.stdout.write('%s%s' % (message, terminator))
        sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(u'(?u)%\\(.+?\\)s', self._params['outtmpl']) is None)

    def download(self, url_list):
        """Download a given list of URLs.

        Each URL is handed to the first registered InfoExtractor that
        reports it as suitable. The program exits when a fixed output
        name would have to be shared by more than one file.
        """
        if len(url_list) > 1 and self.fixed_template():
            sys.exit('ERROR: fixed output name but more than one file to download')

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                # An extractor may return None (nothing extracted) or a
                # list containing None entries for failed videos.
                results = [x for x in (ie.extract(url) or []) if x is not None]

                if len(results) > 1 and self.fixed_template():
                    sys.exit('ERROR: fixed output name but more than one file to download')

                for result in results:
                    self._process_result(result)
                # Only the first suitable extractor handles a URL.
                break
            if not suitable_found:
                self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)

    def _process_result(self, result):
        """Print and/or download the single video described by result."""
        # Forced printings
        if self._params.get('forcetitle', False):
            print(result['title'])
        if self._params.get('forceurl', False):
            print(result['url'])

        # Do nothing else if in simulate mode
        if self._params.get('simulate', False):
            return

        try:
            filename = self._params['outtmpl'] % result
        except (ValueError, KeyError, TypeError) as err:
            self.to_stderr('ERROR: invalid output template: %s' % str(err))
            return
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to create directories: %s' % str(err))
            return
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
            return
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Release the file handle even when the transfer fails.
                outstream.close()
        # Network errors are tested first: urllib2.URLError derives from
        # IOError, so the generic I/O handler below would otherwise report
        # them as write failures. ValueError is what _do_download raises
        # for a truncated transfer.
        except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError) as err:
            self.to_stderr('ERROR: unable to download video data: %s' % str(err))
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to write video data: %s' % str(err))

    def _do_download(self, stream, url):
        """Fetch url and write its contents to stream, reporting progress.

        Raises ValueError when the server announced a Content-length and
        the number of bytes received differs from it.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            data_len = data.info().get('Content-length', None)
            if data_len is not None:
                # The header value is a string; a malformed value is
                # treated the same as an absent one.
                try:
                    data_len = int(data_len)
                except ValueError:
                    data_len = None
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                        (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                # Adapt the next read to the throughput just observed.
                block_size = self.best_block_size(after - before, data_block_len)
        finally:
            data.close()

        # Terminate the progress line.
        self.to_stdout('')
        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
247
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor receives a URL and produces the data the
    downloader needs about the video (or videos) behind it: the real
    video URL, the title and its simplified form, the uploader and so
    on. The extract() method returns that data as a list of
    dictionaries, a list because a single URL (a playlist, for example)
    may refer to several videos. Each dictionary must provide:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses are expected to override _real_initialize() and
    _real_extract(), plus the suitable() static method, and will
    normally be instantiated and registered with the main downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Build the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Tell whether this IE can handle url (the base class accepts all)."""
        return True

    def initialize(self):
        """Run the one-time setup (login, etc.) unless already done."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed and return whatever _real_extract() yields."""
        self.initialize()
        information = self._real_extract(url)
        return information

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message unless the attached downloader is in quiet mode."""
        quiet = False
        if self._downloader is not None:
            quiet = self._downloader.get_params().get('quiet', False)
        if not quiet:
            print(message)

    def to_stderr(self, message):
        """Write message and a newline to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Actual setup work; does nothing here, override in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work; does nothing here, override in subclasses."""
        pass
314
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    @staticmethod
    def _decode_entity(match):
        """Return the character for one matched HTML entity.

        match is a regular expression match whose group 1 is the entity
        body without the surrounding '&' and ';'. Named entities and
        decimal/hexadecimal character references are decoded; anything
        that cannot be decoded is returned unchanged.
        """
        entity = match.group(1)
        try:
            if entity.startswith(u'#x') or entity.startswith(u'#X'):
                return unichr(int(entity[2:], 16))
            if entity.startswith(u'#'):
                return unichr(int(entity[1:], 10))
            return unichr(htmlentitydefs.name2codepoint[entity])
        except (KeyError, ValueError, OverflowError):
            # Unknown name, malformed number or a code point unichr()
            # rejects: keep the original text rather than abort.
            return match.group(0)

    def _real_initialize(self):
        """Log in and confirm age when credentials are available.

        Credentials come from the downloader's 'username'/'password'
        parameters or, when 'usenetrc' is set, from the user's .netrc
        file. Login problems only produce warnings; a failure to confirm
        age terminates the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials: proceed anonymously
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Being served the login form again means the login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        The element is a dictionary with the keys 'id', 'url',
        'uploader', 'title', 'stitle' and 'ext', or None when the URL is
        invalid or a required field cannot be found in the video page.
        The program is terminated if the video page cannot be fetched.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        video_extension = {'18': 'mp4'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Only well-formed entities are matched, so a bare '&' followed
        # later by ';' is left alone.
        video_title = re.sub(u'(?u)&(#?\\w+);', self._decode_entity, video_title)

        # simplified title
        simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
4fa74b52
RG
447
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import optparse

        # General configuration
        # A single opener carries both handlers; installing two openers in
        # a row would simply replace the first with the second.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300)  # 5 minutes should be enough (famous last words)

        # Parse command line

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': True,
            'forceurl': True,
            'forcetitle': True,
            'simulate': True,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s'
        })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
        ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')