]>
Commit | Line | Data |
---|---|---|
4fa74b52 RG |
1 | #!/usr/bin/env python |
2 | # -*- coding: utf-8 -*- | |
3 | # Author: Ricardo Garcia Gonzalez | |
4 | # License: Public domain code | |
5 | import htmlentitydefs | |
6 | import httplib | |
7 | import math | |
8 | import netrc | |
9 | import os | |
10 | import os.path | |
11 | import re | |
12 | import socket | |
13 | import string | |
14 | import sys | |
15 | import time | |
16 | import urllib | |
17 | import urllib2 | |
18 | ||
19 | std_headers = { | |
7414bdf1 | 20 | 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1', |
4fa74b52 RG |
21 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', |
22 | 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', | |
23 | 'Accept-Language': 'en-us,en;q=0.5', | |
24 | } | |
25 | ||
26 | simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii') | |
27 | ||
28 | class FileDownloader(object): | |
29 | """File Downloader class. | |
30 | ||
31 | File downloader objects are the ones responsible of downloading the | |
32 | actual video file and writing it to disk if the user has requested | |
33 | it, among some other tasks. In most cases there should be one per | |
34 | program. As, given a video URL, the downloader doesn't know how to | |
35 | extract all the needed information, task that InfoExtractors do, it | |
36 | has to pass the URL to one of them. | |
37 | ||
38 | For this, file downloader objects have a method that allows | |
39 | InfoExtractors to be registered in a given order. When it is passed | |
40 | a URL, the file downloader handles it to the first InfoExtractor it | |
b4634726 | 41 | finds that reports being able to handle it. The InfoExtractor returns |
4fa74b52 RG |
42 | all the information to the FileDownloader and the latter downloads the |
43 | file or does whatever it's instructed to do. | |
44 | ||
45 | File downloaders accept a lot of parameters. In order not to saturate | |
46 | the object constructor with arguments, it receives a dictionary of | |
47 | options instead. These options are available through the get_params() | |
48 | method for the InfoExtractors to use. The FileDownloader also registers | |
49 | itself as the downloader in charge for the InfoExtractors that are | |
50 | added to it, so this is a "mutual registration". | |
51 | ||
52 | Available options: | |
53 | ||
54 | username: Username for authentication purposes. | |
55 | password: Password for authentication purposes. | |
56 | usenetrc: Use netrc for authentication instead. | |
57 | quiet: Do not print messages to stdout. | |
05a84b35 RG |
58 | forceurl: Force printing final URL. |
59 | forcetitle: Force printing title. | |
b609fd54 | 60 | simulate: Do not download the video files. |
4fa74b52 RG |
61 | format: Video format code. |
62 | outtmpl: Template for output names. | |
63 | """ | |
64 | ||
65 | _params = None | |
66 | _ies = [] | |
67 | ||
68 | def __init__(self, params): | |
69 | self._ies = [] | |
70 | self.set_params(params) | |
71 | ||
72 | @staticmethod | |
73 | def pmkdir(filename): | |
74 | """Create directory components in filename. Similar to Unix "mkdir -p".""" | |
75 | components = filename.split(os.sep) | |
76 | aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))] | |
77 | for dir in aggregate: | |
78 | if not os.path.exists(dir): | |
79 | os.mkdir(dir) | |
80 | ||
81 | @staticmethod | |
82 | def format_bytes(bytes): | |
83 | if bytes is None: | |
84 | return 'N/A' | |
85 | if bytes == 0: | |
86 | exponent = 0 | |
87 | else: | |
88 | exponent = long(math.log(float(bytes), 1024.0)) | |
89 | suffix = 'bkMGTPEZY'[exponent] | |
4fa74b52 RG |
90 | converted = float(bytes) / float(1024**exponent) |
91 | return '%.2f%s' % (converted, suffix) | |
92 | ||
93 | @staticmethod | |
94 | def calc_percent(byte_counter, data_len): | |
95 | if data_len is None: | |
96 | return '---.-%' | |
97 | return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) | |
98 | ||
99 | @staticmethod | |
100 | def calc_eta(start, now, total, current): | |
101 | if total is None: | |
102 | return '--:--' | |
103 | dif = now - start | |
104 | if current == 0 or dif < 0.001: # One millisecond | |
105 | return '--:--' | |
106 | rate = float(current) / dif | |
107 | eta = long((float(total) - float(current)) / rate) | |
108 | (eta_mins, eta_secs) = divmod(eta, 60) | |
109 | if eta_mins > 99: | |
110 | return '--:--' | |
111 | return '%02d:%02d' % (eta_mins, eta_secs) | |
112 | ||
113 | @staticmethod | |
114 | def calc_speed(start, now, bytes): | |
115 | dif = now - start | |
116 | if bytes == 0 or dif < 0.001: # One millisecond | |
9fcd8355 | 117 | return '%10s' % '---b/s' |
4fa74b52 RG |
118 | return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) |
119 | ||
120 | @staticmethod | |
121 | def best_block_size(elapsed_time, bytes): | |
122 | new_min = max(bytes / 2.0, 1.0) | |
123 | new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB | |
124 | if elapsed_time < 0.001: | |
125 | return int(new_max) | |
126 | rate = bytes / elapsed_time | |
127 | if rate > new_max: | |
128 | return int(new_max) | |
129 | if rate < new_min: | |
130 | return int(new_min) | |
131 | return int(rate) | |
132 | ||
133 | def set_params(self, params): | |
134 | """Sets parameters.""" | |
135 | if type(params) != dict: | |
136 | raise ValueError('params: dictionary expected') | |
137 | self._params = params | |
138 | ||
139 | def get_params(self): | |
140 | """Get parameters.""" | |
141 | return self._params | |
142 | ||
143 | def add_info_extractor(self, ie): | |
144 | """Add an InfoExtractor object to the end of the list.""" | |
145 | self._ies.append(ie) | |
146 | ie.set_downloader(self) | |
147 | ||
9fcd8355 RG |
148 | def to_stdout(self, message, skip_eol=False): |
149 | """Print message to stdout if not in quiet mode.""" | |
150 | if not self._params.get('quiet', False): | |
151 | sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol])) | |
152 | sys.stdout.flush() | |
7e5cab67 RG |
153 | |
154 | def to_stderr(self, message): | |
155 | """Print message to stderr.""" | |
156 | sys.stderr.write('%s\n' % message) | |
22899cea RG |
157 | |
158 | def fixed_template(self): | |
159 | """Checks if the output template is fixed.""" | |
f97c8db7 | 160 | return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None) |
9fcd8355 | 161 | |
4fa74b52 RG |
162 | def download(self, url_list): |
163 | """Download a given list of URLs.""" | |
22899cea RG |
164 | if len(url_list) > 1 and self.fixed_template(): |
165 | sys.exit('ERROR: fixed output name but more than one file to download') | |
166 | ||
4fa74b52 RG |
167 | for url in url_list: |
168 | suitable_found = False | |
169 | for ie in self._ies: | |
170 | if not ie.suitable(url): | |
171 | continue | |
172 | # Suitable InfoExtractor found | |
173 | suitable_found = True | |
b4634726 RG |
174 | results = [x for x in ie.extract(url) if x is not None] |
175 | ||
22899cea | 176 | if len(results) > 1 and self.fixed_template(): |
b4634726 RG |
177 | sys.exit('ERROR: fixed output name but more than one file to download') |
178 | ||
179 | for result in results: | |
05a84b35 RG |
180 | |
181 | # Forced printings | |
182 | if self._params.get('forcetitle', False): | |
183 | print result['title'] | |
184 | if self._params.get('forceurl', False): | |
185 | print result['url'] | |
186 | ||
187 | # Do nothing else if in simulate mode | |
188 | if self._params.get('simulate', False): | |
189 | continue | |
190 | ||
4fa74b52 RG |
191 | try: |
192 | filename = self._params['outtmpl'] % result | |
14c30068 | 193 | except (ValueError, KeyError), err: |
7e5cab67 | 194 | self.to_stderr('ERROR: invalid output template: %s' % str(err)) |
4fa74b52 RG |
195 | continue |
196 | try: | |
197 | self.pmkdir(filename) | |
198 | except (OSError, IOError), err: | |
7e5cab67 | 199 | self.to_stderr('ERROR: unable to create directories: %s' % str(err)) |
4fa74b52 RG |
200 | continue |
201 | try: | |
202 | outstream = open(filename, 'wb') | |
203 | except (OSError, IOError), err: | |
7e5cab67 | 204 | self.to_stderr('ERROR: unable to open for writing: %s' % str(err)) |
4fa74b52 RG |
205 | continue |
206 | try: | |
207 | self._do_download(outstream, result['url']) | |
208 | outstream.close() | |
209 | except (OSError, IOError), err: | |
7e5cab67 | 210 | self.to_stderr('ERROR: unable to write video data: %s' % str(err)) |
4fa74b52 RG |
211 | continue |
212 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: | |
7e5cab67 | 213 | self.to_stderr('ERROR: unable to download video data: %s' % str(err)) |
4fa74b52 RG |
214 | continue |
215 | break | |
216 | if not suitable_found: | |
7e5cab67 | 217 | self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url) |
4fa74b52 RG |
218 | |
219 | def _do_download(self, stream, url): | |
220 | request = urllib2.Request(url, None, std_headers) | |
221 | data = urllib2.urlopen(request) | |
222 | data_len = data.info().get('Content-length', None) | |
223 | data_len_str = self.format_bytes(data_len) | |
224 | byte_counter = 0 | |
225 | block_size = 1024 | |
226 | start = time.time() | |
227 | while True: | |
228 | percent_str = self.calc_percent(byte_counter, data_len) | |
229 | eta_str = self.calc_eta(start, time.time(), data_len, byte_counter) | |
230 | speed_str = self.calc_speed(start, time.time(), byte_counter) | |
9fcd8355 RG |
231 | self.to_stdout('\r[download] %s of %s at %s ETA %s' % |
232 | (percent_str, data_len_str, speed_str, eta_str), skip_eol=True) | |
4fa74b52 RG |
233 | |
234 | before = time.time() | |
235 | data_block = data.read(block_size) | |
236 | after = time.time() | |
237 | data_block_len = len(data_block) | |
238 | if data_block_len == 0: | |
239 | break | |
240 | byte_counter += data_block_len | |
241 | stream.write(data_block) | |
242 | block_size = self.best_block_size(after - before, data_block_len) | |
243 | ||
9fcd8355 | 244 | self.to_stdout('') |
4fa74b52 RG |
245 | if data_len is not None and str(byte_counter) != data_len: |
246 | raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len)) | |
247 | ||
248 | class InfoExtractor(object): | |
249 | """Information Extractor class. | |
250 | ||
251 | Information extractors are the classes that, given a URL, extract | |
252 | information from the video (or videos) the URL refers to. This | |
253 | information includes the real video URL, the video title and simplified | |
254 | title, author and others. It is returned in a list of dictionaries when | |
255 | calling its extract() method. It is a list because a URL can refer to | |
256 | more than one video (think of playlists). The dictionaries must include | |
257 | the following fields: | |
258 | ||
259 | id: Video identifier. | |
260 | url: Final video URL. | |
261 | uploader: Nickname of the video uploader. | |
262 | title: Literal title. | |
263 | stitle: Simplified title. | |
264 | ext: Video filename extension. | |
265 | ||
266 | Subclasses of this one should re-define the _real_initialize() and | |
267 | _real_extract() methods, as well as the suitable() static method. | |
268 | Probably, they should also be instantiated and added to the main | |
269 | downloader. | |
270 | """ | |
271 | ||
272 | _ready = False | |
273 | _downloader = None | |
274 | ||
275 | def __init__(self, downloader=None): | |
276 | """Constructor. Receives an optional downloader.""" | |
277 | self._ready = False | |
278 | self.set_downloader(downloader) | |
279 | ||
280 | @staticmethod | |
281 | def suitable(url): | |
282 | """Receives a URL and returns True if suitable for this IE.""" | |
283 | return True | |
284 | ||
285 | def initialize(self): | |
286 | """Initializes an instance (login, etc).""" | |
287 | if not self._ready: | |
288 | self._real_initialize() | |
289 | self._ready = True | |
290 | ||
291 | def extract(self, url): | |
292 | """Extracts URL information and returns it in list of dicts.""" | |
293 | self.initialize() | |
294 | return self._real_extract(url) | |
295 | ||
296 | def set_downloader(self, downloader): | |
297 | """Sets the downloader for this IE.""" | |
298 | self._downloader = downloader | |
299 | ||
300 | def to_stdout(self, message): | |
301 | if self._downloader is None or not self._downloader.get_params().get('quiet', False): | |
302 | print message | |
303 | ||
304 | def to_stderr(self, message): | |
305 | sys.stderr.write('%s\n' % message) | |
306 | ||
307 | def _real_initialize(self): | |
308 | """Real initialization process. Redefine in subclasses.""" | |
309 | pass | |
310 | ||
311 | def _real_extract(self, url): | |
312 | """Real extraction process. Redefine in subclasses.""" | |
313 | pass | |
314 | ||
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Login / age-verification form targets and the .netrc machine name
    # used to look up stored credentials.
    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    def _real_initialize(self):
        """Log in to YouTube when credentials are available, then confirm age.

        Credentials come from the downloader params ('username'/'password')
        or, when 'usenetrc' is set, from the "youtube" machine entry in
        ~/.netrc. Login failures are warnings (anonymous use still works);
        a failed age confirmation aborts the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # authenticators() returns (login, account, password).
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials from either source: proceed anonymously.
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':         '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Scrape the watch page for *url* and return a one-element list
        with the video's information dictionary, or [None] when any
        extraction step fails (the caller filters None entries out).
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        # fmt 18 is served as MP4; everything else as FLV.
        video_extension = {'18': 'mp4'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        # Session token embedded in the watch page; required by get_video.
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Replace named HTML entities (&amp; etc.) with their characters.
        video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)

        # simplified title
        # Collapse every run of disallowed characters into a single '_'
        # and trim leading/trailing underscores.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Return information
        return [{
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension,
            }]
4fa74b52 RG |
447 | |
if __name__ == '__main__':
    try:
        # General configuration
        # Build ONE opener carrying every handler: each call to
        # urllib2.install_opener() replaces the previously installed
        # opener, so installing the proxy handler and the cookie
        # processor in two separate calls silently discarded the
        # proxy handler.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': True,
            'forceurl': True,
            'forcetitle': True,
            'simulate': True,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s',
            })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
            ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')