[yt-dlp.git] / youtube-dl

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# License: Public domain code
import htmlentitydefs
import httplib
import math
import netrc
import os
import os.path
import re
import socket
import string
import sys
import time
import urllib
import urllib2

# HTTP headers sent with every request so that the server sees an ordinary
# desktop browser. The User-Agent value must be the bare product string: the
# header name is supplied by the dictionary key, so it must not be repeated
# inside the value.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of the characters allowed in a simplified title: the ASCII
# letters followed by the decimal digits.
simple_title_chars = u'%s%s' % (string.ascii_letters, string.digits)

class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible for downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader hands it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor returns
	all the information to the FileDownloader and the latter downloads the
	file or does whatever it's instructed to do.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the get_params()
	method for the InfoExtractors to use. The FileDownloader also registers
	itself as the downloader in charge for the InfoExtractors that are
	added to it, so this is a "mutual registration".

	Available options:

	username:	Username for authentication purposes.
	password:	Password for authentication purposes.
	usenetrc:	Use netrc for authentication instead.
	quiet:		Do not print messages to stdout.
	simulate:	Do not download the video files.
	format:		Video format code.
	outtmpl:	Template for output names.
	"""

	# Class-level defaults; __init__ always replaces both on the instance,
	# so the list below is never shared between downloaders.
	_params = None
	_ies = []

	def __init__(self, params):
		"""Create a downloader configured by the params dictionary."""
		self._ies = []
		self.set_params(params)

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p".

		The last component is taken to be the file itself and is not created.
		Raises OSError if a directory cannot be created.
		"""
		components = filename.split(os.sep)
		for count in range(1, len(components)):
			directory = os.sep.join(components[0:count])
			# An absolute path splits into a leading '' component. There is
			# nothing to create for it and os.mkdir('') would fail.
			if directory and not os.path.exists(directory):
				os.mkdir(directory)

	@staticmethod
	def format_bytes(bytes):
		"""Return a byte count as a short string such as '1.50k'.

		bytes may be None (yields 'N/A'), a number, or a numeric string such
		as a raw Content-length header value.
		"""
		if bytes is None:
			return 'N/A'
		suffixes = 'bkMGTPEZY'
		amount = float(bytes)
		if amount < 1.0:
			# Covers zero (log undefined) and fractional values, whose
			# negative exponent would otherwise index the table from its end.
			exponent = 0
		else:
			# Clamp so that absurdly large values still get the last suffix
			exponent = min(int(math.log(amount, 1024.0)), len(suffixes) - 1)
		converted = amount / float(1024 ** exponent)
		return '%.2f%s' % (converted, suffixes[exponent])

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return the completed percentage as a 6-character string.

		Returns the placeholder '---.-%' when the total length is unknown
		(None) or zero.
		"""
		if data_len is None:
			return '---.-%'
		total = float(data_len)
		if total == 0.0:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Return the estimated remaining time as 'MM:SS'.

		Returns '--:--' when the total is unknown, when nothing has been
		transferred yet, when less than a millisecond has elapsed, or when
		the estimate exceeds 99 minutes.
		"""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = int((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average transfer speed as a 10-character string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Return the size to request for the next read.

		The result follows the rate observed for the last block but is kept
		between half and double the last block size, and never above 4 MB.
		"""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return int(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return int(new_max)
		if rate < new_min:
			return int(new_min)
		return int(rate)

	def set_params(self, params):
		"""Sets parameters. Raises ValueError unless params is a dictionary."""
		if not isinstance(params, dict):
			raise ValueError('params: dictionary expected')
		self._params = params

	def get_params(self):
		"""Get parameters."""
		return self._params

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def to_stdout(self, message, skip_eol=False):
		"""Print message to stdout if not in quiet mode."""
		if not self._params.get('quiet', False):
			if skip_eol:
				terminator = ''
			else:
				terminator = '\n'
			sys.stdout.write('%s%s' % (message, terminator))
			# Progress lines have no newline, so flush to make them visible
			sys.stdout.flush()

	def to_stderr(self, message):
		"""Print message to stderr."""
		sys.stderr.write('%s\n' % message)

	def download(self, url_list):
		"""Download a given list of URLs.

		Each URL is handled by the first registered InfoExtractor that
		reports it as suitable. Per-file failures are reported on stderr
		and do not stop the remaining downloads. Exits the program if the
		output template is a fixed name but several files would be written.
		"""
		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				if not ie.suitable(url):
					continue
				# Suitable InfoExtractor found
				suitable_found = True
				results = [x for x in ie.extract(url) if x is not None]

				if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
					sys.exit('ERROR: fixed output name but more than one file to download')

				if not self._params.get('simulate', False):
					for result in results:
						self._download_result(result)
				# Only the first suitable InfoExtractor handles a URL,
				# whether or not the download is being simulated.
				break
			if not suitable_found:
				self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)

	def _download_result(self, result):
		"""Write the video described by one result dictionary to disk."""
		try:
			filename = self._params['outtmpl'] % result
		except KeyError as err:
			self.to_stderr('ERROR: invalid output template: %s' % str(err))
			return
		try:
			self.pmkdir(filename)
		except (OSError, IOError) as err:
			self.to_stderr('ERROR: unable to create directories: %s' % str(err))
			return
		try:
			outstream = open(filename, 'wb')
		except (OSError, IOError) as err:
			self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
			return
		try:
			try:
				self._do_download(outstream, result['url'])
			finally:
				# Close the file even when the transfer fails
				outstream.close()
		# The network clause must come first: urllib2.URLError is itself an
		# IOError, so the generic clause below would otherwise claim it.
		# ValueError is what _do_download raises on a length mismatch.
		except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError) as err:
			self.to_stderr('ERROR: unable to download video data: %s' % str(err))
		except (OSError, IOError) as err:
			self.to_stderr('ERROR: unable to write video data: %s' % str(err))

	def _do_download(self, stream, url):
		"""Fetch url and copy its body to stream, printing progress.

		Raises ValueError if the server announced a Content-length that
		does not match the number of bytes received.
		"""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		try:
			# The header arrives as a string; work with a number from here
			# on and treat an unparsable value as "length unknown".
			data_len = data.info().get('Content-length', None)
			if data_len is not None:
				try:
					data_len = int(data_len)
				except ValueError:
					data_len = None
			data_len_str = self.format_bytes(data_len)
			byte_counter = 0
			block_size = 1024
			start = time.time()
			while True:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
				speed_str = self.calc_speed(start, time.time(), byte_counter)
				self.to_stdout('\r[download] %s of %s at %s ETA %s' %
						(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

				before = time.time()
				data_block = data.read(block_size)
				after = time.time()
				data_block_len = len(data_block)
				if data_block_len == 0:
					break
				byte_counter += data_block_len
				stream.write(data_block)
				# Adapt the next read to the speed just observed
				block_size = self.best_block_size(after - before, data_block_len)

			self.to_stdout('')
			if data_len is not None and byte_counter != data_len:
				raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
		finally:
			data.close()

class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. It is returned in a list of dictionaries when
	calling its extract() method. It is a list because a URL can refer to
	more than one video (think of playlists). The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	# Class-level defaults; __init__ sets both on every instance.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return True

	def initialize(self):
		"""Initializes an instance (login, etc)."""
		# _real_initialize() runs at most once per instance
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		results = self._real_extract(url)
		# Callers iterate over the result, so never hand them None
		if results is None:
			return []
		return results

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def to_stdout(self, message):
		"""Print message to stdout unless the downloader is in quiet mode."""
		if self._downloader is None or not self._downloader.get_params().get('quiet', False):
			sys.stdout.write('%s\n' % (message,))

	def to_stderr(self, message):
		"""Print message to stderr."""
		sys.stderr.write('%s\n' % message)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses.

		The base implementation extracts nothing and returns an empty list.
		"""
		return []

class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	_LOGIN_URL = 'http://www.youtube.com/login?next=/'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
	_NETRC_MACHINE = 'youtube'

	@staticmethod
	def _decode_entity(mobj):
		"""Return the character for one matched HTML entity.

		mobj is a match whose group 1 is the entity body without the
		surrounding '&' and ';'. Named entities and decimal or hexadecimal
		numeric references are decoded; anything else is returned
		unchanged so that an odd title never aborts the extraction.
		"""
		entity = mobj.group(1)
		if entity in htmlentitydefs.name2codepoint:
			return unichr(htmlentitydefs.name2codepoint[entity])
		numeric = re.match(u'(?i)#(x?)([0-9a-f]+)$', entity)
		if numeric is not None:
			if numeric.group(1):
				base = 16
			else:
				base = 10
			try:
				return unichr(int(numeric.group(2), base))
			except (ValueError, OverflowError):
				# Hex digits in a decimal reference, or a code point
				# that unichr() cannot represent
				pass
		return mobj.group(0)

	def _real_initialize(self):
		"""Log in and confirm age, when credentials are available.

		Credentials come from the downloader's 'username'/'password'
		parameters or, if 'usenetrc' is set, from the .netrc file. Login
		problems only produce a warning; a network failure while confirming
		age exits the program.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.get_params()

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params.get('password', None)
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError) as err:
				self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
				return

		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.to_stdout('[youtube] Logging in')
			login_results = urllib2.urlopen(request).read()
			# Getting the login form back means the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self.to_stderr('WARNING: Unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self.to_stderr('WARNING: Unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.to_stdout('[youtube] Confirming age')
			# The response body is read but its content is not needed
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			sys.exit('ERROR: Unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		"""Return a one-element list describing the video at url.

		The element is a dictionary with the fields 'id', 'url',
		'uploader', 'title', 'stitle' and 'ext', or None if the URL is
		invalid or a field cannot be found in the video page. Exits the
		program if the video page cannot be downloaded.
		"""
		# Extract video id from URL
		mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
		if mobj is None:
			self.to_stderr('ERROR: Invalid URL: %s' % url)
			return [None]
		video_id = mobj.group(2)

		# Downloader parameters
		format_param = None
		if self._downloader is not None:
			params = self._downloader.get_params()
			format_param = params.get('format', None)

		# Extension; compare as text so that both 18 and '18' select mp4
		video_extension = {'18': 'mp4'}.get(str(format_param), 'flv')

		# Normalize URL, including format
		normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
		if format_param is not None:
			normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
		request = urllib2.Request(normalized_url, None, std_headers)
		try:
			self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			sys.exit('ERROR: Unable to download video: %s' % str(err))
		self.to_stdout('[youtube] %s: Extracting video information' % video_id)

		# "t" param
		mobj = re.search(r', "t": "([^"]+)"', video_webpage)
		if mobj is None:
			self.to_stderr('ERROR: Unable to extract "t" parameter')
			return [None]
		video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
		if format_param is not None:
			video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
		self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

		# uploader
		mobj = re.search(r'More From: ([^<]*)<', video_webpage)
		if mobj is None:
			self.to_stderr('ERROR: Unable to extract uploader nickname')
			return [None]
		video_uploader = mobj.group(1)

		# title
		mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
		if mobj is None:
			self.to_stderr('ERROR: Unable to extract video title')
			return [None]
		video_title = mobj.group(1).decode('utf-8')
		# Match only well-formed entity references, so a bare '&' followed
		# later by an unrelated ';' is left alone.
		video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._decode_entity, video_title)

		# simplified title
		simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
		simple_title = simple_title.strip(u'_')

		# Return information
		return [{
			'id':		video_id,
			'url':		video_real_url,
			'uploader':	video_uploader,
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension,
			}]

if __name__ == '__main__':
	try:
		# General configuration
		# install_opener() replaces the global opener on every call, so the
		# proxy and cookie handlers are installed together in a single opener.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

		# Information extractors
		youtube_ie = YoutubeIE()

		# File downloader
		# 'simulate' is on: the video pages are fetched and parsed but no
		# video file is written.
		fd = FileDownloader({
			'usenetrc': False,
			'username': None,
			'password': None,
			'quiet': False,
			'simulate': True,
			'format': None,
			'outtmpl': '%(id)s.%(ext)s'
			})
		fd.add_info_extractor(youtube_ie)
		fd.download([
			'http://www.youtube.com/watch?v=t7qdwI7TVe8',
			'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
			'http://www.youtube.com/watch?v=DZRXe1wtC-M',
			])

	except KeyboardInterrupt:
		sys.exit('\nERROR: Interrupted by user')
Commit	Line	Data
4fa74b52 RG	1	#!/usr/bin/env python
	2	# -*- coding: utf-8 -*-
	3	# Author: Ricardo Garcia Gonzalez
	4	# License: Public domain code
	5	import htmlentitydefs
	6	import httplib
	7	import math
	8	import netrc
	9	import os
	10	import os.path
	11	import re
	12	import socket
	13	import string
	14	import sys
	15	import time
	16	import urllib
	17	import urllib2
	18
	19	std_headers = {
	20	'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
	21	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	22	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	23	'Accept-Language': 'en-us,en;q=0.5',
	24	}
	25
	26	simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
	27
	28	class FileDownloader(object):
	29	"""File Downloader class.
	30
	31	File downloader objects are the ones responsible of downloading the
	32	actual video file and writing it to disk if the user has requested
	33	it, among some other tasks. In most cases there should be one per
	34	program. As, given a video URL, the downloader doesn't know how to
	35	extract all the needed information, task that InfoExtractors do, it
	36	has to pass the URL to one of them.
	37
	38	For this, file downloader objects have a method that allows
	39	InfoExtractors to be registered in a given order. When it is passed
	40	a URL, the file downloader handles it to the first InfoExtractor it
b4634726	41	finds that reports being able to handle it. The InfoExtractor returns
4fa74b52 RG	42	all the information to the FileDownloader and the latter downloads the
	43	file or does whatever it's instructed to do.
	44
	45	File downloaders accept a lot of parameters. In order not to saturate
	46	the object constructor with arguments, it receives a dictionary of
	47	options instead. These options are available through the get_params()
	48	method for the InfoExtractors to use. The FileDownloader also registers
	49	itself as the downloader in charge for the InfoExtractors that are
	50	added to it, so this is a "mutual registration".
	51
	52	Available options:
	53
	54	username: Username for authentication purposes.
	55	password: Password for authentication purposes.
	56	usenetrc: Use netrc for authentication instead.
	57	quiet: Do not print messages to stdout.
b609fd54	58	simulate: Do not download the video files.
4fa74b52 RG	59	format: Video format code.
	60	outtmpl: Template for output names.
	61	"""
	62
	63	_params = None
	64	_ies = []
	65
	66	def __init__(self, params):
	67	self._ies = []
	68	self.set_params(params)
	69
	70	@staticmethod
	71	def pmkdir(filename):
	72	"""Create directory components in filename. Similar to Unix "mkdir -p"."""
	73	components = filename.split(os.sep)
	74	aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
	75	for dir in aggregate:
	76	if not os.path.exists(dir):
	77	os.mkdir(dir)
	78
	79	@staticmethod
	80	def format_bytes(bytes):
	81	if bytes is None:
	82	return 'N/A'
	83	if bytes == 0:
	84	exponent = 0
	85	else:
	86	exponent = long(math.log(float(bytes), 1024.0))
	87	suffix = 'bkMGTPEZY'[exponent]
4fa74b52 RG	88	converted = float(bytes) / float(1024**exponent)
	89	return '%.2f%s' % (converted, suffix)
	90
	91	@staticmethod
	92	def calc_percent(byte_counter, data_len):
	93	if data_len is None:
	94	return '---.-%'
	95	return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
	96
	97	@staticmethod
	98	def calc_eta(start, now, total, current):
	99	if total is None:
	100	return '--:--'
	101	dif = now - start
	102	if current == 0 or dif < 0.001: # One millisecond
	103	return '--:--'
	104	rate = float(current) / dif
	105	eta = long((float(total) - float(current)) / rate)
	106	(eta_mins, eta_secs) = divmod(eta, 60)
	107	if eta_mins > 99:
	108	return '--:--'
	109	return '%02d:%02d' % (eta_mins, eta_secs)
	110
	111	@staticmethod
	112	def calc_speed(start, now, bytes):
	113	dif = now - start
	114	if bytes == 0 or dif < 0.001: # One millisecond
9fcd8355	115	return '%10s' % '---b/s'
4fa74b52 RG	116	return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
	117
	118	@staticmethod
	119	def best_block_size(elapsed_time, bytes):
	120	new_min = max(bytes / 2.0, 1.0)
	121	new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
	122	if elapsed_time < 0.001:
	123	return int(new_max)
	124	rate = bytes / elapsed_time
	125	if rate > new_max:
	126	return int(new_max)
	127	if rate < new_min:
	128	return int(new_min)
	129	return int(rate)
	130
	131	def set_params(self, params):
	132	"""Sets parameters."""
	133	if type(params) != dict:
	134	raise ValueError('params: dictionary expected')
	135	self._params = params
	136
	137	def get_params(self):
	138	"""Get parameters."""
	139	return self._params
	140
	141	def add_info_extractor(self, ie):
	142	"""Add an InfoExtractor object to the end of the list."""
	143	self._ies.append(ie)
	144	ie.set_downloader(self)
	145
9fcd8355 RG	146	def to_stdout(self, message, skip_eol=False):
	147	"""Print message to stdout if not in quiet mode."""
	148	if not self._params.get('quiet', False):
	149	sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
	150	sys.stdout.flush()
7e5cab67 RG	151
	152	def to_stderr(self, message):
	153	"""Print message to stderr."""
	154	sys.stderr.write('%s\n' % message)
9fcd8355	155
4fa74b52 RG	156	def download(self, url_list):
	157	"""Download a given list of URLs."""
	158	for url in url_list:
	159	suitable_found = False
	160	for ie in self._ies:
	161	if not ie.suitable(url):
	162	continue
	163	# Suitable InfoExtractor found
	164	suitable_found = True
b4634726 RG	165	results = [x for x in ie.extract(url) if x is not None]
	166
	167	if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
	168	sys.exit('ERROR: fixed output name but more than one file to download')
	169
b609fd54 RG	170	if self._params.get('simulate', False):
	171	continue
	172
b4634726	173	for result in results:
4fa74b52 RG	174	try:
	175	filename = self._params['outtmpl'] % result
	176	except (KeyError), err:
7e5cab67	177	self.to_stderr('ERROR: invalid output template: %s' % str(err))
4fa74b52 RG	178	continue
	179	try:
	180	self.pmkdir(filename)
	181	except (OSError, IOError), err:
7e5cab67	182	self.to_stderr('ERROR: unable to create directories: %s' % str(err))
4fa74b52 RG	183	continue
	184	try:
	185	outstream = open(filename, 'wb')
	186	except (OSError, IOError), err:
7e5cab67	187	self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
4fa74b52 RG	188	continue
	189	try:
	190	self._do_download(outstream, result['url'])
	191	outstream.close()
	192	except (OSError, IOError), err:
7e5cab67	193	self.to_stderr('ERROR: unable to write video data: %s' % str(err))
4fa74b52 RG	194	continue
4fa74b52 RG	195	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
7e5cab67	196	self.to_stderr('ERROR: unable to download video data: %s' % str(err))
4fa74b52 RG	197	continue
	198	break
	199	if not suitable_found:
7e5cab67	200	self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
4fa74b52 RG	201
	202	def _do_download(self, stream, url):
	203	request = urllib2.Request(url, None, std_headers)
	204	data = urllib2.urlopen(request)
	205	data_len = data.info().get('Content-length', None)
	206	data_len_str = self.format_bytes(data_len)
	207	byte_counter = 0
	208	block_size = 1024
	209	start = time.time()
	210	while True:
	211	percent_str = self.calc_percent(byte_counter, data_len)
	212	eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
	213	speed_str = self.calc_speed(start, time.time(), byte_counter)
9fcd8355 RG	214	self.to_stdout('\r[download] %s of %s at %s ETA %s' %
9fcd8355 RG	215	(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
4fa74b52 RG	216
	217	before = time.time()
	218	data_block = data.read(block_size)
	219	after = time.time()
	220	data_block_len = len(data_block)
	221	if data_block_len == 0:
	222	break
	223	byte_counter += data_block_len
	224	stream.write(data_block)
	225	block_size = self.best_block_size(after - before, data_block_len)
	226
9fcd8355	227	self.to_stdout('')
4fa74b52 RG	228	if data_len is not None and str(byte_counter) != data_len:
	229	raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
	230
	231	class InfoExtractor(object):
	232	"""Information Extractor class.
	233
	234	Information extractors are the classes that, given a URL, extract
	235	information from the video (or videos) the URL refers to. This
	236	information includes the real video URL, the video title and simplified
	237	title, author and others. It is returned in a list of dictionaries when
	238	calling its extract() method. It is a list because a URL can refer to
	239	more than one video (think of playlists). The dictionaries must include
	240	the following fields:
	241
	242	id: Video identifier.
	243	url: Final video URL.
	244	uploader: Nickname of the video uploader.
	245	title: Literal title.
	246	stitle: Simplified title.
	247	ext: Video filename extension.
	248
	249	Subclasses of this one should re-define the _real_initialize() and
	250	_real_extract() methods, as well as the suitable() static method.
	251	Probably, they should also be instantiated and added to the main
	252	downloader.
	253	"""
	254
	255	_ready = False
	256	_downloader = None
	257
	258	def __init__(self, downloader=None):
	259	"""Constructor. Receives an optional downloader."""
	260	self._ready = False
	261	self.set_downloader(downloader)
	262
	263	@staticmethod
	264	def suitable(url):
	265	"""Receives a URL and returns True if suitable for this IE."""
	266	return True
	267
	268	def initialize(self):
	269	"""Initializes an instance (login, etc)."""
	270	if not self._ready:
	271	self._real_initialize()
	272	self._ready = True
	273
	274	def extract(self, url):
	275	"""Extracts URL information and returns it in list of dicts."""
	276	self.initialize()
	277	return self._real_extract(url)
	278
	279	def set_downloader(self, downloader):
	280	"""Sets the downloader for this IE."""
	281	self._downloader = downloader
	282
	283	def to_stdout(self, message):
	284	if self._downloader is None or not self._downloader.get_params().get('quiet', False):
	285	print message
	286
	287	def to_stderr(self, message):
	288	sys.stderr.write('%s\n' % message)
	289
	290	def _real_initialize(self):
	291	"""Real initialization process. Redefine in subclasses."""
292	pass
293
294	def _real_extract(self, url):
295	"""Real extraction process. Redefine in subclasses."""
296	pass
297
298	class YoutubeIE(InfoExtractor):
299	"""Information extractor for youtube.com."""
300
301	_LOGIN_URL = 'http://www.youtube.com/login?next=/'
302	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
303	_NETRC_MACHINE = 'youtube'
304
305	def _real_initialize(self):
306	if self._downloader is None:
307	return
308
309	username = None
310	password = None
311	downloader_params = self._downloader.get_params()
312
313	# Attempt to use provided username and password or .netrc data
314	if downloader_params.get('username', None) is not None:
315	username = downloader_params['username']
316	password = downloader_params['password']
317	elif downloader_params.get('usenetrc', False):
318	try:
319	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
320	if info is not None:
321	username = info[0]
322	password = info[2]
323	else:
324	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
325	except (IOError, netrc.NetrcParseError), err:
326	self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
327	return
328
329	if username is None:
330	return
331
332	# Log in
9fcd8355 RG	333	login_form = {
9fcd8355 RG	334	'current_form': 'loginForm',
4fa74b52 RG	335	'next': '/',
	336	'action_login': 'Log In',
	337	'username': username,
9fcd8355 RG	338	'password': password,
9fcd8355 RG	339	}
4fa74b52 RG	340	request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
	341	try:
	342	self.to_stdout('[youtube] Logging in')
	343	login_results = urllib2.urlopen(request).read()
	344	if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
	345	self.to_stderr('WARNING: Unable to log in: bad username or password')
	346	return
	347	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
	348	self.to_stderr('WARNING: Unable to log in: %s' % str(err))
	349	return
	350
	351	# Confirm age
9fcd8355 RG	352	age_form = {
	353	'next_url': '/',
	354	'action_confirm': 'Confirm',
	355	}
4fa74b52 RG	356	request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
	357	try:
	358	self.to_stdout('[youtube] Confirming age')
	359	age_results = urllib2.urlopen(request).read()
	360	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
	361	sys.exit('ERROR: Unable to confirm age: %s' % str(err))
	362
	363	def _real_extract(self, url):
	364	# Extract video id from URL
	365	mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
	366	if mobj is None:
	367	self.to_stderr('ERROR: Invalid URL: %s' % url)
	368	return [None]
	369	video_id = mobj.group(2)
	370
	371	# Downloader parameters
	372	format_param = None
	373	if self._downloader is not None:
	374	params = self._downloader.get_params()
	375	format_param = params.get('format', None)
	376
	377	# Extension
	378	video_extension = {18: 'mp4'}.get(format_param, 'flv')
	379
	380	# Normalize URL, including format
	381	normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
	382	if format_param is not None:
	383	normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
	384	request = urllib2.Request(normalized_url, None, std_headers)
	385	try:
	386	self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
	387	video_webpage = urllib2.urlopen(request).read()
	388	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
	389	sys.exit('ERROR: Unable to download video: %s' % str(err))
	390	self.to_stdout('[youtube] %s: Extracting video information' % video_id)
	391
	392	# "t" param
	393	mobj = re.search(r', "t": "([^"]+)"', video_webpage)
	394	if mobj is None:
	395	self.to_stderr('ERROR: Unable to extract "t" parameter')
	396	return [None]
	397	video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
	398	if format_param is not None:
	399	video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
9fcd8355	400	self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
4fa74b52 RG	401
	402	# uploader
	403	mobj = re.search(r'More From: ([^<]*)<', video_webpage)
	404	if mobj is None:
	405	self.to_stderr('ERROR: Unable to extract uploader nickname')
	406	return [None]
	407	video_uploader = mobj.group(1)
	408
	409	# title
	410	mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
	411	if mobj is None:
	412	self.to_stderr('ERROR: Unable to extract video title')
	413	return [None]
	414	video_title = mobj.group(1).decode('utf-8')
	415	video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
	416
	417	# simplified title
	418	simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
	419	simple_title = simple_title.strip(u'_')
	420
	421	# Return information
9fcd8355 RG	422	return [{
	423	'id': video_id,
	424	'url': video_real_url,
	425	'uploader': video_uploader,
	426	'title': video_title,
	427	'stitle': simple_title,
	428	'ext': video_extension,
	429	}]
4fa74b52 RG	430
	431	if __name__ == '__main__':
	432	try:
	433	# General configuration
	434	urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
	435	urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
	436
	437	# Information extractors
	438	youtube_ie = YoutubeIE()
	439
	440	# File downloader
9fcd8355 RG	441	fd = FileDownloader({
	442	'usenetrc': False,
	443	'username': None,
	444	'password': None,
	445	'quiet': False,
b609fd54	446	'simulate': True,
9fcd8355 RG	447	'format': None,
	448	'outtmpl': '%(id)s.%(ext)s'
	449	})
4fa74b52	450	fd.add_info_extractor(youtube_ie)
9fcd8355 RG	451	fd.download([
	452	'http://www.youtube.com/watch?v=t7qdwI7TVe8',
	453	'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
	454	'http://www.youtube.com/watch?v=DZRXe1wtC-M',
	455	])
4fa74b52 RG	456
	457	except KeyboardInterrupt:
	458	sys.exit('\nERROR: Interrupted by user')