]> jfr.im git - yt-dlp.git/blame - youtube-dl
Add .to_stderr() to downloaders
[yt-dlp.git] / youtube-dl
CommitLineData
4fa74b52
RG
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Author: Ricardo Garcia Gonzalez
4# License: Public domain code
5import htmlentitydefs
6import httplib
7import math
8import netrc
9import os
10import os.path
11import re
12import socket
13import string
14import sys
15import time
16import urllib
17import urllib2
18
# Default HTTP headers attached to every request this script makes. They
# imitate a desktop Firefox 3.0 browser. The User-Agent value must be the bare
# product string: the header name is supplied by the dictionary key, so it
# must not be repeated inside the value.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}
25
# Unicode string of the characters (ASCII letters followed by digits) that are
# kept when a video title is reduced to its "simplified" form.
simple_title_chars = u'%s%s' % (string.ascii_letters, string.digits)
27
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:   Username for authentication purposes.
    password:   Password for authentication purposes.
    usenetrc:   Use netrc for authentication instead.
    quiet:      Do not print messages to stdout.
    simulate:   Do not download the video files.
    format:     Video format code.
    outtmpl:    Template for output names.
    """

    _params = None
    _ies = []

    def __init__(self, params):
        """Create a downloader configured by the params dictionary."""
        # Give every instance its own list instead of sharing the class one.
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        The last component is taken to be the file name and is not created.
        Raises OSError when a directory cannot be created.
        """
        components = filename.split(os.sep)
        for count in range(1, len(components)):
            path = os.sep.join(components[:count])
            # An absolute path produces an empty leading component; there is
            # nothing to create for it and os.mkdir('') would fail.
            if path and not os.path.exists(path):
                os.mkdir(path)

    @staticmethod
    def format_bytes(bytes):
        """Return a short human-readable size such as '1.50k', or 'N/A' for None.

        Accepts a number or a numeric string (as found in HTTP headers).
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        amount = float(bytes)
        if amount < 1.0:
            # Covers zero (the logarithm is undefined) and fractional values,
            # whose negative exponent would select a wrong suffix.
            exponent = 0
        else:
            exponent = min(int(math.log(amount, 1024.0)), len(suffixes) - 1)
        converted = amount / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a six-character string.

        '---.-%' is returned when the total length is unknown (None) or zero.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0:
            # Avoid dividing by zero for an empty body.
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS', or '--:--' if unknown.

        The estimate is unknown when the total is None, nothing has been
        received yet, less than a millisecond has elapsed, or it would exceed
        99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a ten-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the size to request next, given how long the last block took.

        The result follows the measured rate but stays between half and twice
        the size of the last block, and never exceeds 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters. Raises ValueError if params is not a dictionary."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode.

        No newline is appended when skip_eol is true.
        """
        if self._params.get('quiet', False):
            return
        if skip_eol:
            terminator = ''
        else:
            terminator = '\n'
        sys.stdout.write('%s%s' % (message, terminator))
        sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def download(self, url_list):
        """Download a given list of URLs.

        Each URL is handled by the first registered InfoExtractor that
        reports it as suitable; an error is printed when there is none.
        Exits the program if several files would be written to a fixed
        output name.
        """
        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                results = [x for x in ie.extract(url) if x is not None]

                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                    sys.exit('ERROR: fixed output name but more than one file to download')

                if not self._params.get('simulate', False):
                    for result in results:
                        self._download_result(result)
                # The URL has been handled; do not hand it to the remaining
                # extractors, whether simulating or not.
                break
            if not suitable_found:
                self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)

    def _download_result(self, result):
        """Download one extracted video to the file named by the output template.

        Problems are reported on stderr and the video is skipped. A ValueError
        raised by _do_download is not handled here and propagates.
        """
        try:
            filename = self._params['outtmpl'] % result
        except KeyError as err:
            self.to_stderr('ERROR: invalid output template: %s' % str(err))
            return
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to create directories: %s' % str(err))
            return
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
            return
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file even when the transfer fails.
                outstream.close()
        # Network errors are checked first: urllib2.URLError derives from
        # IOError, so the generic clause below would otherwise hide them.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to download video data: %s' % str(err))
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to write video data: %s' % str(err))

    def _do_download(self, stream, url):
        """Fetch url and write its body to stream, printing progress to stdout.

        Raises ValueError if the number of bytes received differs from the
        Content-length announced by the server.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # The header value is text; convert once so it is compared and
            # formatted as a number everywhere below.
            data_len = int(data_len)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        while True:
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                    (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len
            stream.write(data_block)
            # Adapt the next request to the speed just observed.
            block_size = self.best_block_size(after - before, data_block_len)

        # Terminate the progress line.
        self.to_stdout('')
        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
230
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor takes a URL and gathers the data describing
    the video (or videos) behind it: the real video URL, the title and its
    simplified form, the uploader and so on. Calling extract() returns that
    data as a list of dictionaries; a list is used because one URL may refer
    to several videos (think of playlists). Each dictionary must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.

    Subclasses are expected to override _real_initialize() and
    _real_extract(), together with the suitable() static method. They
    will normally be instantiated and registered with the main downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Build the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Tell whether this IE can handle the URL (always True here)."""
        return True

    def initialize(self):
        """Run the one-time setup (login, etc) unless already done."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dicts for url."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Remember the downloader this IE reports to."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message unless the attached downloader is in quiet mode."""
        quiet = False
        if self._downloader is not None:
            quiet = self._downloader.get_params().get('quiet', False)
        if not quiet:
            print(message)

    def to_stderr(self, message):
        """Write message, followed by a newline, to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Actual setup work. To be overridden by subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work. To be overridden by subclasses."""
        pass
297
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    @staticmethod
    def _htmlentity_transform(matchobj):
        """Return the character for a matched HTML entity.

        Named entities and numeric references (decimal or hexadecimal) are
        decoded. Anything unrecognized is returned unchanged instead of
        raising, so an odd title cannot abort the extraction.
        """
        entity = matchobj.group(1)
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        mobj = re.match(u'(?i)#(x[0-9a-f]+|[0-9]+)$', entity)
        if mobj is not None:
            numstr = mobj.group(1)
            if numstr[0] in u'xX':
                codepoint = int(numstr[1:], 16)
            else:
                codepoint = int(numstr, 10)
            try:
                return unichr(codepoint)
            except (ValueError, OverflowError):
                # Code point outside the range unichr() accepts.
                pass
        return matchobj.group(0)

    def _real_initialize(self):
        """Log in and confirm age when credentials are available.

        Credentials come from the downloader parameters or, if requested,
        from .netrc. Login problems only produce a warning; failing to
        confirm age exits the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            # The response body is read but its content is not needed.
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Return a one-element list with the information dictionary for url.

        The list contains None instead when the URL is invalid or a field
        cannot be extracted from the video page. Exits the program if the
        video page cannot be downloaded.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        video_extension = {18: 'mp4'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Match one entity at a time (no '&', ';' or whitespace inside) so a
        # stray ampersand earlier in the title cannot swallow a real entity.
        video_title = re.sub(u'&([^&;\\s]+);', self._htmlentity_transform, video_title)

        # simplified title
        simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
4fa74b52
RG
430
if __name__ == '__main__':
    try:
        # General configuration: a single opener with proxy and cookie
        # support. install_opener() replaces the global opener, so installing
        # two openers in a row would keep only the last one.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': False,
            'simulate': True,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s'
            })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
            ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')