#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import

import base64
import datetime
import itertools
import json
import netrc
import os
import re
import socket
import sys
import time
import email.utils
import xml.etree.ElementTree
import random
import math
import operator

from .utils import *
21 | ||
22 | ||
23 | class InfoExtractor(object): | |
24 | """Information Extractor class. | |
25 | ||
26 | Information extractors are the classes that, given a URL, extract | |
27 | information about the video (or videos) the URL refers to. This | |
28 | information includes the real video URL, the video title, author and | |
29 | others. The information is stored in a dictionary which is then | |
30 | passed to the FileDownloader. The FileDownloader processes this | |
31 | information possibly downloading the video to the file system, among | |
32 | other possible outcomes. | |
33 | ||
34 | The dictionaries must include the following fields: | |
35 | ||
36 | id: Video identifier. | |
37 | url: Final video URL. | |
38 | title: Video title, unescaped. | |
39 | ext: Video filename extension. | |
40 | ||
41 | The following fields are optional: | |
42 | ||
43 | format: The video format, defaults to ext (used for --get-format) | |
44 | thumbnail: Full URL to a video thumbnail image. | |
45 | description: One-line video description. | |
46 | uploader: Full name of the video uploader. | |
47 | upload_date: Video upload date (YYYYMMDD). | |
48 | uploader_id: Nickname or id of the video uploader. | |
49 | location: Physical location of the video. | |
50 | player_url: SWF Player URL (used for rtmpdump). | |
51 | subtitles: The subtitle file contents. | |
52 | urlhandle: [internal] The urlHandle to be used to download the file, | |
53 | like returned by urllib.request.urlopen | |
54 | ||
55 | The fields should all be Unicode strings. | |
56 | ||
57 | Subclasses of this one should re-define the _real_initialize() and | |
58 | _real_extract() methods and define a _VALID_URL regexp. | |
59 | Probably, they should also be added to the list of extractors. | |
60 | ||
61 | _real_extract() must return a *list* of information dictionaries as | |
62 | described above. | |
63 | ||
64 | Finally, the _WORKING attribute should be set to False for broken IEs | |
65 | in order to warn the users and skip the tests. | |
66 | """ | |
67 | ||
68 | _ready = False | |
69 | _downloader = None | |
70 | _WORKING = True | |
71 | ||
72 | def __init__(self, downloader=None): | |
73 | """Constructor. Receives an optional downloader.""" | |
74 | self._ready = False | |
75 | self.set_downloader(downloader) | |
76 | ||
77 | @classmethod | |
78 | def suitable(cls, url): | |
79 | """Receives a URL and returns True if suitable for this IE.""" | |
80 | return re.match(cls._VALID_URL, url) is not None | |
81 | ||
82 | @classmethod | |
83 | def working(cls): | |
84 | """Getter method for _WORKING.""" | |
85 | return cls._WORKING | |
86 | ||
87 | def initialize(self): | |
88 | """Initializes an instance (authentication, etc).""" | |
89 | if not self._ready: | |
90 | self._real_initialize() | |
91 | self._ready = True | |
92 | ||
93 | def extract(self, url): | |
94 | """Extracts URL information and returns it in list of dicts.""" | |
95 | self.initialize() | |
96 | return self._real_extract(url) | |
97 | ||
98 | def set_downloader(self, downloader): | |
99 | """Sets the downloader for this IE.""" | |
100 | self._downloader = downloader | |
101 | ||
102 | def _real_initialize(self): | |
103 | """Real initialization process. Redefine in subclasses.""" | |
104 | pass | |
105 | ||
106 | def _real_extract(self, url): | |
107 | """Real extraction process. Redefine in subclasses.""" | |
108 | pass | |
109 | ||
110 | @property | |
111 | def IE_NAME(self): | |
112 | return type(self).__name__[:-2] | |
113 | ||
114 | def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): | |
115 | """ Returns the response handle """ | |
116 | if note is None: | |
117 | self.report_download_webpage(video_id) | |
118 | elif note is not False: | |
119 | self.to_screen(u'%s: %s' % (video_id, note)) | |
120 | try: | |
121 | return compat_urllib_request.urlopen(url_or_request) | |
122 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
123 | if errnote is None: | |
124 | errnote = u'Unable to download webpage' | |
125 | raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) | |
126 | ||
127 | def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): | |
128 | """ Returns a tuple (page content as string, URL handle) """ | |
129 | urlh = self._request_webpage(url_or_request, video_id, note, errnote) | |
130 | content_type = urlh.headers.get('Content-Type', '') | |
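        # Pull the charset out of a Content-Type header such as
        # "text/html; charset=ISO-8859-1" (an illustrative value);
        # fall back to UTF-8 when no charset is declared.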
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
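
    # A typical call from a subclass might look like this (the note text
    # is illustrative):
    #
    #     webpage = self._download_webpage(url, video_id,
    #                                      note=u'Downloading player page')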
151 | ||
152 | def to_screen(self, msg): | |
153 | """Print msg to screen, prefixing it with '[ie_name]'""" | |
154 | self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) | |
155 | ||
156 | def report_extraction(self, id_or_name): | |
157 | """Report information extraction.""" | |
158 | self.to_screen(u'%s: Extracting information' % id_or_name) | |
159 | ||
160 | def report_download_webpage(self, video_id): | |
161 | """Report webpage download.""" | |
162 | self.to_screen(u'%s: Downloading webpage' % video_id) | |
163 | ||
164 | def report_age_confirmation(self): | |
165 | """Report attempt to confirm age.""" | |
166 | self.to_screen(u'Confirming age') | |
167 | ||
168 | #Methods for following #608 | |
169 | #They set the correct value of the '_type' key | |
170 | def video_result(self, video_info): | |
171 | """Returns a video""" | |
172 | video_info['_type'] = 'video' | |
173 | return video_info | |
174 | def url_result(self, url, ie=None): | |
175 | """Returns a url that points to a page that should be processed""" | |
176 | #TODO: ie should be the class used for getting the info | |
177 | video_info = {'_type': 'url', | |
178 | 'url': url, | |
179 | 'ie_key': ie} | |
180 | return video_info | |
181 | def playlist_result(self, entries, playlist_id=None, playlist_title=None): | |
182 | """Returns a playlist""" | |
183 | video_info = {'_type': 'playlist', | |
184 | 'entries': entries} | |
185 | if playlist_id: | |
186 | video_info['id'] = playlist_id | |
187 | if playlist_title: | |
188 | video_info['title'] = playlist_title | |
189 | return video_info | |
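
    # Illustrative usage from a hypothetical extractor (names made up):
    #
    #     def _real_extract(self, url):
    #         entries = [self.url_result(u'http://example.com/video/1'),
    #                    self.url_result(u'http://example.com/video/2')]
    #         return [self.playlist_result(entries, playlist_id=u'42',
    #                                      playlist_title=u'Some playlist')]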
190 | ||
191 | ||
192 | class YoutubeIE(InfoExtractor): | |
193 | """Information extractor for youtube.com.""" | |
194 | ||
195 | _VALID_URL = r"""^ | |
196 | ( | |
197 | (?:https?://)? # http(s):// (optional) | |
198 | (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| | |
199 | tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains | |
200 | (?:.*?\#/)? # handle anchor (#/) redirect urls | |
201 | (?: # the various things that can precede the ID: | |
202 | (?:(?:v|embed|e)/) # v/ or embed/ or e/ | |
203 | |(?: # or the v= param in all its forms | |
204 | (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) | |
205 | (?:\?|\#!?) # the params delimiter ? or # or #! | |
206 | (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) | |
207 | v= | |
208 | ) | |
209 | )? # optional -> youtube.com/xxxx is OK | |
210 | )? # all until now is optional -> you can pass the naked ID | |
211 | ([0-9A-Za-z_-]+) # here is it! the YouTube video ID | |
212 | (?(1).+)? # if we found the ID, everything can follow | |
213 | $""" | |
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
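    # Note that the dimensions above are listed as height x width
    # (e.g. itag '22' is 720p: 720 high by 1280 wide).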
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for the given language."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
298 | ||
299 | def _get_available_subtitles(self, video_id): | |
300 | self.report_video_subtitles_download(video_id) | |
301 | request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) | |
302 | try: | |
303 | sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') | |
304 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
305 | return (u'unable to download video subtitles: %s' % compat_str(err), None) | |
306 | sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) | |
307 | sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) | |
308 | if not sub_lang_list: | |
309 | return (u'video doesn\'t have subtitles', None) | |
310 | return sub_lang_list | |
311 | ||
312 | def _list_available_subtitles(self, video_id): | |
313 | sub_lang_list = self._get_available_subtitles(video_id) | |
314 | self.report_video_subtitles_available(video_id, sub_lang_list) | |
315 | ||
316 | def _request_subtitle(self, sub_lang, sub_name, video_id, format): | |
317 | """ | |
318 | Return tuple: | |
319 | (error_message, sub_lang, sub) | |
320 | """ | |
321 | self.report_video_subtitles_request(video_id, sub_lang, format) | |
322 | params = compat_urllib_parse.urlencode({ | |
323 | 'lang': sub_lang, | |
324 | 'name': sub_name, | |
325 | 'v': video_id, | |
326 | 'fmt': format, | |
327 | }) | |
328 | url = 'http://www.youtube.com/api/timedtext?' + params | |
329 | try: | |
330 | sub = compat_urllib_request.urlopen(url).read().decode('utf-8') | |
331 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
332 | return (u'unable to download video subtitles: %s' % compat_str(err), None, None) | |
333 | if not sub: | |
334 | return (u'Did not fetch video subtitles', None, None) | |
335 | return (None, sub_lang, sub) | |
336 | ||
337 | def _extract_subtitle(self, video_id): | |
338 | """ | |
339 | Return a list with a tuple: | |
340 | [(error_message, sub_lang, sub)] | |
341 | """ | |
342 | sub_lang_list = self._get_available_subtitles(video_id) | |
343 | sub_format = self._downloader.params.get('subtitlesformat') | |
344 | if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles | |
345 | return [(sub_lang_list[0], None, None)] | |
346 | if self._downloader.params.get('subtitleslang', False): | |
347 | sub_lang = self._downloader.params.get('subtitleslang') | |
348 | elif 'en' in sub_lang_list: | |
349 | sub_lang = 'en' | |
350 | else: | |
351 | sub_lang = list(sub_lang_list.keys())[0] | |
352 | if not sub_lang in sub_lang_list: | |
353 | return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] | |
354 | ||
355 | subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) | |
356 | return [subtitle] | |
357 | ||
358 | def _extract_all_subtitles(self, video_id): | |
359 | sub_lang_list = self._get_available_subtitles(video_id) | |
360 | sub_format = self._downloader.params.get('subtitlesformat') | |
361 | if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles | |
362 | return [(sub_lang_list[0], None, None)] | |
363 | subtitles = [] | |
364 | for sub_lang in sub_lang_list: | |
365 | subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) | |
366 | subtitles.append(subtitle) | |
367 | return subtitles | |
368 | ||
369 | def _print_formats(self, formats): | |
370 | print('Available formats:') | |
371 | for x in formats: | |
372 | print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) | |
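
    # Output lines from _print_formats are tab-separated and look like
    # (illustrative):
    #   22  :   mp4 [720x1280]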
373 | ||
374 | def _real_initialize(self): | |
375 | if self._downloader is None: | |
376 | return | |
377 | ||
378 | username = None | |
379 | password = None | |
380 | downloader_params = self._downloader.params | |
381 | ||
382 | # Attempt to use provided username and password or .netrc data | |
383 | if downloader_params.get('username', None) is not None: | |
384 | username = downloader_params['username'] | |
385 | password = downloader_params['password'] | |
386 | elif downloader_params.get('usenetrc', False): | |
387 | try: | |
388 | info = netrc.netrc().authenticators(self._NETRC_MACHINE) | |
389 | if info is not None: | |
390 | username = info[0] | |
391 | password = info[2] | |
392 | else: | |
393 | raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) | |
394 | except (IOError, netrc.NetrcParseError) as err: | |
395 | self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) | |
396 | return | |
397 | ||
398 | # Set language | |
399 | request = compat_urllib_request.Request(self._LANG_URL) | |
400 | try: | |
401 | self.report_lang() | |
402 | compat_urllib_request.urlopen(request).read() | |
403 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
404 | self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) | |
405 | return | |
406 | ||
407 | # No authentication to be performed | |
408 | if username is None: | |
409 | return | |
410 | ||
411 | request = compat_urllib_request.Request(self._LOGIN_URL) | |
412 | try: | |
413 | login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') | |
414 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
415 | self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) | |
416 | return | |
417 | ||
418 | galx = None | |
419 | dsh = None | |
420 | match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page) | |
421 | if match: | |
422 | galx = match.group(1) | |
423 | ||
424 | match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page) | |
425 | if match: | |
426 | dsh = match.group(1) | |
427 | ||
428 | # Log in | |
429 | login_form_strs = { | |
430 | u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', | |
431 | u'Email': username, | |
432 | u'GALX': galx, | |
433 | u'Passwd': password, | |
434 | u'PersistentCookie': u'yes', | |
435 | u'_utf8': u'霱', | |
436 | u'bgresponse': u'js_disabled', | |
437 | u'checkConnection': u'', | |
438 | u'checkedDomains': u'youtube', | |
439 | u'dnConn': u'', | |
440 | u'dsh': dsh, | |
441 | u'pstMsg': u'0', | |
442 | u'rmShown': u'1', | |
443 | u'secTok': u'', | |
444 | u'signIn': u'Sign in', | |
445 | u'timeStmp': u'', | |
446 | u'service': u'youtube', | |
447 | u'uilel': u'3', | |
448 | u'hl': u'en_US', | |
449 | } | |
450 | # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode | |
451 | # chokes on unicode | |
452 | login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) | |
453 | login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') | |
454 | request = compat_urllib_request.Request(self._LOGIN_URL, login_data) | |
455 | try: | |
456 | self.report_login() | |
457 | login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') | |
458 | if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: | |
459 | self._downloader.report_warning(u'unable to log in: bad username or password') | |
460 | return | |
461 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
462 | self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) | |
463 | return | |
464 | ||
465 | # Confirm age | |
466 | age_form = { | |
467 | 'next_url': '/', | |
468 | 'action_confirm': 'Confirm', | |
469 | } | |
470 | request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) | |
471 | try: | |
472 | self.report_age_confirmation() | |
473 | age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') | |
474 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
475 | raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) | |
476 | ||
477 | def _extract_id(self, url): | |
478 | mobj = re.match(self._VALID_URL, url, re.VERBOSE) | |
479 | if mobj is None: | |
480 | raise ExtractorError(u'Invalid URL: %s' % url) | |
481 | video_id = mobj.group(2) | |
482 | return video_id | |
483 | ||
484 | def _real_extract(self, url): | |
485 | # Extract original video URL from URL with redirection, like age verification, using next_url parameter | |
486 | mobj = re.search(self._NEXT_URL_RE, url) | |
487 | if mobj: | |
488 | url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') | |
489 | video_id = self._extract_id(url) | |
490 | ||
491 | # Get video webpage | |
492 | self.report_video_webpage_download(video_id) | |
493 | url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id | |
494 | request = compat_urllib_request.Request(url) | |
495 | try: | |
496 | video_webpage_bytes = compat_urllib_request.urlopen(request).read() | |
497 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
498 | raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) | |
499 | ||
500 | video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') | |
501 | ||
502 | # Attempt to extract SWF player URL | |
503 | mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) | |
504 | if mobj is not None: | |
505 | player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) | |
506 | else: | |
507 | player_url = None | |
508 | ||
509 | # Get video info | |
510 | self.report_video_info_webpage_download(video_id) | |
511 | for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: | |
512 | video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' | |
513 | % (video_id, el_type)) | |
514 | video_info_webpage = self._download_webpage(video_info_url, video_id, | |
515 | note=False, | |
516 | errnote='unable to download video info webpage') | |
517 | video_info = compat_parse_qs(video_info_webpage) | |
518 | if 'token' in video_info: | |
519 | break | |
520 | if 'token' not in video_info: | |
521 | if 'reason' in video_info: | |
522 | raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0]) | |
523 | else: | |
524 | raise ExtractorError(u'"token" parameter not in video info for unknown reason') | |
525 | ||
526 | # Check for "rental" videos | |
527 | if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: | |
528 | raise ExtractorError(u'"rental" videos not supported') | |
529 | ||
530 | # Start extracting information | |
531 | self.report_information_extraction(video_id) | |
532 | ||
533 | # uploader | |
534 | if 'author' not in video_info: | |
535 | raise ExtractorError(u'Unable to extract uploader name') | |
536 | video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) | |
537 | ||
538 | # uploader_id | |
539 | video_uploader_id = None | |
540 | mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage) | |
541 | if mobj is not None: | |
542 | video_uploader_id = mobj.group(1) | |
543 | else: | |
544 | self._downloader.report_warning(u'unable to extract uploader nickname') | |
545 | ||
546 | # title | |
547 | if 'title' not in video_info: | |
548 | raise ExtractorError(u'Unable to extract video title') | |
549 | video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) | |
550 | ||
551 | # thumbnail image | |
552 | if 'thumbnail_url' not in video_info: | |
553 | self._downloader.report_warning(u'unable to extract video thumbnail') | |
554 | video_thumbnail = '' | |
555 | else: # don't panic if we can't find it | |
556 | video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) | |
557 | ||
558 | # upload date | |
559 | upload_date = None | |
560 | mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL) | |
561 | if mobj is not None: | |
562 | upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) | |
563 | upload_date = unified_strdate(upload_date) | |
564 | ||
565 | # description | |
566 | video_description = get_element_by_id("eow-description", video_webpage) | |
567 | if video_description: | |
568 | video_description = clean_html(video_description) | |
569 | else: | |
570 | fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) | |
571 | if fd_mobj: | |
572 | video_description = unescapeHTML(fd_mobj.group(1)) | |
573 | else: | |
574 | video_description = u'' | |
575 | ||
576 | # subtitles | |
577 | video_subtitles = None | |
578 | ||
579 | if self._downloader.params.get('writesubtitles', False): | |
580 | video_subtitles = self._extract_subtitle(video_id) | |
581 | if video_subtitles: | |
582 | (sub_error, sub_lang, sub) = video_subtitles[0] | |
583 | if sub_error: | |
584 | self._downloader.report_error(sub_error) | |
585 | ||
586 | if self._downloader.params.get('allsubtitles', False): | |
587 | video_subtitles = self._extract_all_subtitles(video_id) | |
588 | for video_subtitle in video_subtitles: | |
589 | (sub_error, sub_lang, sub) = video_subtitle | |
590 | if sub_error: | |
591 | self._downloader.report_error(sub_error) | |
592 | ||
593 | if self._downloader.params.get('listsubtitles', False): | |
594 | sub_lang_list = self._list_available_subtitles(video_id) | |
595 | return | |
596 | ||
597 | if 'length_seconds' not in video_info: | |
598 | self._downloader.report_warning(u'unable to extract video duration') | |
599 | video_duration = '' | |
600 | else: | |
601 | video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) | |
602 | ||
603 | # token | |
604 | video_token = compat_urllib_parse.unquote_plus(video_info['token'][0]) | |
605 | ||
606 | # Decide which formats to download | |
607 | req_format = self._downloader.params.get('format', None) | |
608 | ||
609 | if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): | |
610 | self.report_rtmp_download() | |
611 | video_url_list = [(None, video_info['conn'][0])] | |
612 | elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: | |
613 | url_map = {} | |
614 | for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): | |
615 | url_data = compat_parse_qs(url_data_str) | |
616 | if 'itag' in url_data and 'url' in url_data: | |
617 | url = url_data['url'][0] + '&signature=' + url_data['sig'][0] | |
618 | if not 'ratebypass' in url: url += '&ratebypass=yes' | |
619 | url_map[url_data['itag'][0]] = url | |
620 | ||
621 | format_limit = self._downloader.params.get('format_limit', None) | |
622 | available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats | |
623 | if format_limit is not None and format_limit in available_formats: | |
624 | format_list = available_formats[available_formats.index(format_limit):] | |
625 | else: | |
626 | format_list = available_formats | |
627 | existing_formats = [x for x in format_list if x in url_map] | |
628 | if len(existing_formats) == 0: | |
629 | raise ExtractorError(u'no known formats available for video') | |
630 | if self._downloader.params.get('listformats', None): | |
631 | self._print_formats(existing_formats) | |
632 | return | |
633 | if req_format is None or req_format == 'best': | |
634 | video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality | |
635 | elif req_format == 'worst': | |
636 | video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality | |
637 | elif req_format in ('-1', 'all'): | |
638 | video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats | |
639 | else: | |
640 | # Specific formats. We pick the first in a slash-delimeted sequence. | |
641 | # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. | |
642 | req_formats = req_format.split('/') | |
643 | video_url_list = None | |
644 | for rf in req_formats: | |
645 | if rf in url_map: | |
646 | video_url_list = [(rf, url_map[rf])] | |
647 | break | |
648 | if video_url_list is None: | |
649 | raise ExtractorError(u'requested format not available') | |
650 | else: | |
651 | raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') | |
652 | ||
653 | results = [] | |
654 | for format_param, video_real_url in video_url_list: | |
655 | # Extension | |
656 | video_extension = self._video_extensions.get(format_param, 'flv') | |
657 | ||
658 | video_format = '{0} - {1}'.format(format_param if format_param else video_extension, | |
659 | self._video_dimensions.get(format_param, '???')) | |
660 | ||
661 | results.append({ | |
662 | 'id': video_id, | |
663 | 'url': video_real_url, | |
664 | 'uploader': video_uploader, | |
665 | 'uploader_id': video_uploader_id, | |
666 | 'upload_date': upload_date, | |
667 | 'title': video_title, | |
668 | 'ext': video_extension, | |
669 | 'format': video_format, | |
670 | 'thumbnail': video_thumbnail, | |
671 | 'description': video_description, | |
672 | 'player_url': player_url, | |
673 | 'subtitles': video_subtitles, | |
674 | 'duration': video_duration | |
675 | }) | |
676 | return results | |
677 | ||
678 | ||
679 | class MetacafeIE(InfoExtractor): | |
680 | """Information Extractor for metacafe.com.""" | |
681 | ||
682 | _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' | |
683 | _DISCLAIMER = 'http://www.metacafe.com/family_filter/' | |
684 | _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' | |
685 | IE_NAME = u'metacafe' | |
686 | ||
687 | def report_disclaimer(self): | |
688 | """Report disclaimer retrieval.""" | |
689 | self.to_screen(u'Retrieving disclaimer') | |
690 | ||
691 | def _real_initialize(self): | |
692 | # Retrieve disclaimer | |
693 | request = compat_urllib_request.Request(self._DISCLAIMER) | |
694 | try: | |
695 | self.report_disclaimer() | |
696 | disclaimer = compat_urllib_request.urlopen(request).read() | |
697 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
698 | raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err)) | |
699 | ||
700 | # Confirm age | |
701 | disclaimer_form = { | |
702 | 'filters': '0', | |
703 | 'submit': "Continue - I'm over 18", | |
704 | } | |
705 | request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) | |
706 | try: | |
707 | self.report_age_confirmation() | |
708 | disclaimer = compat_urllib_request.urlopen(request).read() | |
709 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
710 | raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) | |
711 | ||
712 | def _real_extract(self, url): | |
713 | # Extract id and simplified title from URL | |
714 | mobj = re.match(self._VALID_URL, url) | |
715 | if mobj is None: | |
716 | raise ExtractorError(u'Invalid URL: %s' % url) | |
717 | ||
718 | video_id = mobj.group(1) | |
719 | ||
720 | # Check if video comes from YouTube | |
721 | mobj2 = re.match(r'^yt-(.*)$', video_id) | |
722 | if mobj2 is not None: | |
723 | return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] | |
724 | ||
725 | # Retrieve video webpage to extract further information | |
726 | webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) | |
727 | ||
728 | # Extract URL, uploader and title from webpage | |
729 | self.report_extraction(video_id) | |
730 | mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) | |
731 | if mobj is not None: | |
732 | mediaURL = compat_urllib_parse.unquote(mobj.group(1)) | |
733 | video_extension = mediaURL[-3:] | |
734 | ||
735 | # Extract gdaKey if available | |
736 | mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) | |
737 | if mobj is None: | |
738 | video_url = mediaURL | |
739 | else: | |
740 | gdaKey = mobj.group(1) | |
741 | video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) | |
742 | else: | |
743 | mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) | |
744 | if mobj is None: | |
745 | raise ExtractorError(u'Unable to extract media URL') | |
746 | vardict = compat_parse_qs(mobj.group(1)) | |
747 | if 'mediaData' not in vardict: | |
748 | raise ExtractorError(u'Unable to extract media URL') | |
749 | mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0]) | |
750 | if mobj is None: | |
751 | raise ExtractorError(u'Unable to extract media URL') | |
752 | mediaURL = mobj.group('mediaURL').replace('\\/', '/') | |
753 | video_extension = mediaURL[-3:] | |
754 | video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) | |
755 | ||
756 | mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage) | |
757 | if mobj is None: | |
758 | raise ExtractorError(u'Unable to extract title') | |
759 | video_title = mobj.group(1).decode('utf-8') | |
760 | ||
761 | mobj = re.search(r'submitter=(.*?);', webpage) | |
762 | if mobj is None: | |
763 | raise ExtractorError(u'Unable to extract uploader nickname') | |
764 | video_uploader = mobj.group(1) | |
765 | ||
766 | return [{ | |
767 | 'id': video_id.decode('utf-8'), | |
768 | 'url': video_url.decode('utf-8'), | |
769 | 'uploader': video_uploader.decode('utf-8'), | |
770 | 'upload_date': None, | |
771 | 'title': video_title, | |
772 | 'ext': video_extension.decode('utf-8'), | |
773 | }] | |
774 | ||
775 | class DailymotionIE(InfoExtractor): | |
776 | """Information Extractor for Dailymotion""" | |
777 | ||
778 | _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' | |
779 | IE_NAME = u'dailymotion' | |
780 | ||
781 | def _real_extract(self, url): | |
782 | # Extract id and simplified title from URL | |
783 | mobj = re.match(self._VALID_URL, url) | |
784 | if mobj is None: | |
785 | raise ExtractorError(u'Invalid URL: %s' % url) | |
786 | ||
787 | video_id = mobj.group(1).split('_')[0].split('?')[0] | |
788 | ||
789 | video_extension = 'mp4' | |
790 | ||
791 | # Retrieve video webpage to extract further information | |
792 | request = compat_urllib_request.Request(url) | |
793 | request.add_header('Cookie', 'family_filter=off') | |
794 | webpage = self._download_webpage(request, video_id) | |
795 | ||
796 | # Extract URL, uploader and title from webpage | |
797 | self.report_extraction(video_id) | |
798 | mobj = re.search(r'\s*var flashvars = (.*)', webpage) | |
799 | if mobj is None: | |
800 | raise ExtractorError(u'Unable to extract media URL') | |
801 | flashvars = compat_urllib_parse.unquote(mobj.group(1)) | |
802 | ||
803 | for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: | |
804 | if key in flashvars: | |
805 | max_quality = key | |
806 | self.to_screen(u'Using %s' % key) | |
807 | break | |
808 | else: | |
809 | raise ExtractorError(u'Unable to extract video URL') | |
810 | ||
811 | mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) | |
812 | if mobj is None: | |
813 | raise ExtractorError(u'Unable to extract video URL') | |
814 | ||
815 | video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') | |
816 | ||
817 | # TODO: support choosing qualities | |
818 | ||
819 | mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage) | |
820 | if mobj is None: | |
821 | raise ExtractorError(u'Unable to extract title') | |
822 | video_title = unescapeHTML(mobj.group('title')) | |
823 | ||
824 | video_uploader = None | |
825 | mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage) | |
826 | if mobj is None: | |
827 | # lookin for official user | |
828 | mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage) | |
829 | if mobj_official is None: | |
830 | self._downloader.report_warning(u'unable to extract uploader nickname') | |
831 | else: | |
832 | video_uploader = mobj_official.group(1) | |
833 | else: | |
834 | video_uploader = mobj.group(1) | |
835 | ||
836 | video_upload_date = None | |
837 | mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage) | |
838 | if mobj is not None: | |
839 | video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) | |
840 | ||
841 | return [{ | |
842 | 'id': video_id, | |
843 | 'url': video_url, | |
844 | 'uploader': video_uploader, | |
845 | 'upload_date': video_upload_date, | |
846 | 'title': video_title, | |
847 | 'ext': video_extension, | |
848 | }] | |
849 | ||
850 | ||
851 | class PhotobucketIE(InfoExtractor): | |
852 | """Information extractor for photobucket.com.""" | |
853 | ||
854 | # TODO: the original _VALID_URL was: | |
855 | # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' | |
856 | # Check if it's necessary to keep the old extracion process | |
857 | _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' | |
858 | IE_NAME = u'photobucket' | |
859 | ||
860 | def _real_extract(self, url): | |
861 | # Extract id from URL | |
862 | mobj = re.match(self._VALID_URL, url) | |
863 | if mobj is None: | |
864 | raise ExtractorError(u'Invalid URL: %s' % url) | |
865 | ||
866 | video_id = mobj.group('id') | |
867 | ||
868 | video_extension = mobj.group('ext') | |
869 | ||
870 | # Retrieve video webpage to extract further information | |
871 | webpage = self._download_webpage(url, video_id) | |
872 | ||
873 | # Extract URL, uploader, and title from webpage | |
874 | self.report_extraction(video_id) | |
875 | # We try first by looking the javascript code: | |
876 | mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage) | |
877 | if mobj is not None: | |
878 | info = json.loads(mobj.group('json')) | |
879 | return [{ | |
880 | 'id': video_id, | |
881 | 'url': info[u'downloadUrl'], | |
882 | 'uploader': info[u'username'], | |
883 | 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'), | |
884 | 'title': info[u'title'], | |
885 | 'ext': video_extension, | |
886 | 'thumbnail': info[u'thumbUrl'], | |
887 | }] | |
888 | ||
889 | # We try looking in other parts of the webpage | |
890 | mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage) | |
891 | if mobj is None: | |
892 | raise ExtractorError(u'Unable to extract media URL') | |
893 | mediaURL = compat_urllib_parse.unquote(mobj.group(1)) | |
894 | ||
895 | video_url = mediaURL | |
896 | ||
897 | mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage) | |
898 | if mobj is None: | |
899 | raise ExtractorError(u'Unable to extract title') | |
900 | video_title = mobj.group(1).decode('utf-8') | |
901 | ||
902 | video_uploader = mobj.group(2).decode('utf-8') | |
903 | ||
904 | return [{ | |
905 | 'id': video_id.decode('utf-8'), | |
906 | 'url': video_url.decode('utf-8'), | |
907 | 'uploader': video_uploader, | |
908 | 'upload_date': None, | |
909 | 'title': video_title, | |
910 | 'ext': video_extension.decode('utf-8'), | |
911 | }] | |
912 | ||
913 | ||
914 | class YahooIE(InfoExtractor): | |
915 | """Information extractor for screen.yahoo.com.""" | |
916 | _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' | |
917 | ||
918 | def _real_extract(self, url): | |
919 | mobj = re.match(self._VALID_URL, url) | |
920 | if mobj is None: | |
921 | raise ExtractorError(u'Invalid URL: %s' % url) | |
922 | video_id = mobj.group('id') | |
923 | webpage = self._download_webpage(url, video_id) | |
924 | m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage) | |
925 | ||
926 | if m_id is None: | |
927 | # TODO: Check which url parameters are required | |
928 | info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id | |
929 | webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') | |
930 | info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.* | |
931 | <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.* | |
932 | <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.* | |
933 | <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB" | |
934 | ''' | |
935 | self.report_extraction(video_id) | |
936 | m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL) | |
937 | if m_info is None: | |
938 | raise ExtractorError(u'Unable to extract video info') | |
939 | video_title = m_info.group('title') | |
940 | video_description = m_info.group('description') | |
941 | video_thumb = m_info.group('thumb') | |
942 | video_date = m_info.group('date') | |
943 | video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d') | |
944 | ||
945 | # TODO: Find a way to get mp4 videos | |
946 | rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id | |
947 | webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage') | |
948 | m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage) | |
949 | video_url = m_rest.group('url') | |
950 | video_path = m_rest.group('path') | |
951 | if m_rest is None: | |
952 | raise ExtractorError(u'Unable to extract video url') | |
953 | ||
954 | else: # We have to use a different method if another id is defined | |
955 | long_id = m_id.group('new_id') | |
956 | info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335' | |
957 | webpage = self._download_webpage(info_url, video_id, u'Downloading info json') | |
958 | json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1) | |
959 | info = json.loads(json_str) | |
960 | res = info[u'query'][u'results'][u'mediaObj'][0] | |
961 | stream = res[u'streams'][0] | |
962 | video_path = stream[u'path'] | |
963 | video_url = stream[u'host'] | |
964 | meta = res[u'meta'] | |
965 | video_title = meta[u'title'] | |
966 | video_description = meta[u'description'] | |
967 | video_thumb = meta[u'thumbnail'] | |
968 | video_date = None # I can't find it | |
969 | ||
970 | info_dict = { | |
971 | 'id': video_id, | |
972 | 'url': video_url, | |
973 | 'play_path': video_path, | |
974 | 'title':video_title, | |
975 | 'description': video_description, | |
976 | 'thumbnail': video_thumb, | |
977 | 'upload_date': video_date, | |
978 | 'ext': 'flv', | |
979 | } | |
980 | return info_dict | |
981 | ||
982 | class VimeoIE(InfoExtractor): | |
983 | """Information extractor for vimeo.com.""" | |
984 | ||
985 | # _VALID_URL matches Vimeo URLs | |
986 | _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)' | |
987 | IE_NAME = u'vimeo' | |
988 | ||
989 | def _real_extract(self, url, new_video=True): | |
990 | # Extract ID from URL | |
991 | mobj = re.match(self._VALID_URL, url) | |
992 | if mobj is None: | |
993 | raise ExtractorError(u'Invalid URL: %s' % url) | |
994 | ||
995 | video_id = mobj.group('id') | |
996 | if not mobj.group('proto'): | |
997 | url = 'https://' + url | |
998 | if mobj.group('direct_link'): | |
999 | url = 'https://vimeo.com/' + video_id | |
1000 | ||
1001 | # Retrieve video webpage to extract further information | |
1002 | request = compat_urllib_request.Request(url, None, std_headers) | |
1003 | webpage = self._download_webpage(request, video_id) | |
1004 | ||
1005 | # Now we begin extracting as much information as we can from what we | |
1006 | # retrieved. First we extract the information common to all extractors, | |
1007 | # and latter we extract those that are Vimeo specific. | |
1008 | self.report_extraction(video_id) | |
1009 | ||
1010 | # Extract the config JSON | |
1011 | try: | |
1012 | config = webpage.split(' = {config:')[1].split(',assets:')[0] | |
1013 | config = json.loads(config) | |
1014 | except: | |
1015 | if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): | |
1016 | raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') | |
1017 | else: | |
1018 | raise ExtractorError(u'Unable to extract info section') | |
1019 | ||
1020 | # Extract title | |
1021 | video_title = config["video"]["title"] | |
1022 | ||
1023 | # Extract uploader and uploader_id | |
1024 | video_uploader = config["video"]["owner"]["name"] | |
1025 | video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] | |
1026 | ||
1027 | # Extract video thumbnail | |
1028 | video_thumbnail = config["video"]["thumbnail"] | |
1029 | ||
1030 | # Extract video description | |
1031 | video_description = get_element_by_attribute("itemprop", "description", webpage) | |
1032 | if video_description: video_description = clean_html(video_description) | |
1033 | else: video_description = u'' | |
1034 | ||
1035 | # Extract upload date | |
1036 | video_upload_date = None | |
1037 | mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage) | |
1038 | if mobj is not None: | |
1039 | video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) | |
1040 | ||
1041 | # Vimeo specific: extract request signature and timestamp | |
1042 | sig = config['request']['signature'] | |
1043 | timestamp = config['request']['timestamp'] | |
1044 | ||
1045 | # Vimeo specific: extract video codec and quality information | |
1046 | # First consider quality, then codecs, then take everything | |
1047 | # TODO bind to format param | |
1048 | codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] | |
1049 | files = { 'hd': [], 'sd': [], 'other': []} | |
1050 | for codec_name, codec_extension in codecs: | |
1051 | if codec_name in config["video"]["files"]: | |
1052 | if 'hd' in config["video"]["files"][codec_name]: | |
1053 | files['hd'].append((codec_name, codec_extension, 'hd')) | |
1054 | elif 'sd' in config["video"]["files"][codec_name]: | |
1055 | files['sd'].append((codec_name, codec_extension, 'sd')) | |
1056 | else: | |
1057 | files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) | |
1058 | ||
1059 | for quality in ('hd', 'sd', 'other'): | |
1060 | if len(files[quality]) > 0: | |
1061 | video_quality = files[quality][0][2] | |
1062 | video_codec = files[quality][0][0] | |
1063 | video_extension = files[quality][0][1] | |
1064 | self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) | |
1065 | break | |
1066 | else: | |
1067 | raise ExtractorError(u'No known codec found') | |
1068 | ||
1069 | video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ | |
1070 | %(video_id, sig, timestamp, video_quality, video_codec.upper()) | |
1071 | ||
1072 | return [{ | |
1073 | 'id': video_id, | |
1074 | 'url': video_url, | |
1075 | 'uploader': video_uploader, | |
1076 | 'uploader_id': video_uploader_id, | |
1077 | 'upload_date': video_upload_date, | |
1078 | 'title': video_title, | |
1079 | 'ext': video_extension, | |
1080 | 'thumbnail': video_thumbnail, | |
1081 | 'description': video_description, | |
1082 | }] | |
1083 | ||
1084 | ||
1085 | class ArteTvIE(InfoExtractor): | |
1086 | """arte.tv information extractor.""" | |
1087 | ||
1088 | _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' | |
1089 | _LIVE_URL = r'index-[0-9]+\.html$' | |
1090 | ||
1091 | IE_NAME = u'arte.tv' | |
1092 | ||
1093 | def fetch_webpage(self, url): | |
1094 | request = compat_urllib_request.Request(url) | |
1095 | try: | |
1096 | self.report_download_webpage(url) | |
1097 | webpage = compat_urllib_request.urlopen(request).read() | |
1098 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
1099 | raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) | |
1100 | except ValueError as err: | |
1101 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1102 | return webpage | |
1103 | ||
1104 | def grep_webpage(self, url, regex, regexFlags, matchTuples): | |
1105 | page = self.fetch_webpage(url) | |
1106 | mobj = re.search(regex, page, regexFlags) | |
1107 | info = {} | |
1108 | ||
1109 | if mobj is None: | |
1110 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1111 | ||
1112 | for (i, key, err) in matchTuples: | |
1113 | if mobj.group(i) is None: | |
1114 | raise ExtractorError(err) | |
1115 | else: | |
1116 | info[key] = mobj.group(i) | |
1117 | ||
1118 | return info | |
1119 | ||
1120 | def extractLiveStream(self, url): | |
1121 | video_lang = url.split('/')[-4] | |
1122 | info = self.grep_webpage( | |
1123 | url, | |
1124 | r'src="(.*?/videothek_js.*?\.js)', | |
1125 | 0, | |
1126 | [ | |
1127 | (1, 'url', u'Invalid URL: %s' % url) | |
1128 | ] | |
1129 | ) | |
1130 | http_host = url.split('/')[2] | |
1131 | next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) | |
1132 | info = self.grep_webpage( | |
1133 | next_url, | |
1134 | r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + | |
1135 | '(http://.*?\.swf).*?' + | |
1136 | '(rtmp://.*?)\'', | |
1137 | re.DOTALL, | |
1138 | [ | |
1139 | (1, 'path', u'could not extract video path: %s' % url), | |
1140 | (2, 'player', u'could not extract video player: %s' % url), | |
1141 | (3, 'url', u'could not extract video url: %s' % url) | |
1142 | ] | |
1143 | ) | |
1144 | video_url = u'%s/%s' % (info.get('url'), info.get('path')) | |
1145 | ||
1146 | def extractPlus7Stream(self, url): | |
1147 | video_lang = url.split('/')[-3] | |
1148 | info = self.grep_webpage( | |
1149 | url, | |
1150 | r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', | |
1151 | 0, | |
1152 | [ | |
1153 | (1, 'url', u'Invalid URL: %s' % url) | |
1154 | ] | |
1155 | ) | |
1156 | next_url = compat_urllib_parse.unquote(info.get('url')) | |
1157 | info = self.grep_webpage( | |
1158 | next_url, | |
1159 | r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang, | |
1160 | 0, | |
1161 | [ | |
1162 | (1, 'url', u'Could not find <video> tag: %s' % url) | |
1163 | ] | |
1164 | ) | |
1165 | next_url = compat_urllib_parse.unquote(info.get('url')) | |
1166 | ||
1167 | info = self.grep_webpage( | |
1168 | next_url, | |
1169 | r'<video id="(.*?)".*?>.*?' + | |
1170 | '<name>(.*?)</name>.*?' + | |
1171 | '<dateVideo>(.*?)</dateVideo>.*?' + | |
1172 | '<url quality="hd">(.*?)</url>', | |
1173 | re.DOTALL, | |
1174 | [ | |
1175 | (1, 'id', u'could not extract video id: %s' % url), | |
1176 | (2, 'title', u'could not extract video title: %s' % url), | |
1177 | (3, 'date', u'could not extract video date: %s' % url), | |
1178 | (4, 'url', u'could not extract video url: %s' % url) | |
1179 | ] | |
1180 | ) | |
1181 | ||
1182 | return { | |
1183 | 'id': info.get('id'), | |
1184 | 'url': compat_urllib_parse.unquote(info.get('url')), | |
1185 | 'uploader': u'arte.tv', | |
1186 | 'upload_date': unified_strdate(info.get('date')), | |
1187 | 'title': info.get('title').decode('utf-8'), | |
1188 | 'ext': u'mp4', | |
1189 | 'format': u'NA', | |
1190 | 'player_url': None, | |
1191 | } | |
1192 | ||
1193 | def _real_extract(self, url): | |
1194 | video_id = url.split('/')[-1] | |
1195 | self.report_extraction(video_id) | |
1196 | ||
1197 | if re.search(self._LIVE_URL, video_id) is not None: | |
1198 | self.extractLiveStream(url) | |
1199 | return | |
1200 | else: | |
1201 | info = self.extractPlus7Stream(url) | |
1202 | ||
1203 | return [info] | |
1204 | ||
1205 | ||
1206 | class GenericIE(InfoExtractor): | |
1207 | """Generic last-resort information extractor.""" | |
1208 | ||
1209 | _VALID_URL = r'.*' | |
1210 | IE_NAME = u'generic' | |
1211 | ||
1212 | def report_download_webpage(self, video_id): | |
1213 | """Report webpage download.""" | |
1214 | if not self._downloader.params.get('test', False): | |
1215 | self._downloader.report_warning(u'Falling back on generic information extractor.') | |
1216 | super(GenericIE, self).report_download_webpage(video_id) | |
1217 | ||
1218 | def report_following_redirect(self, new_url): | |
1219 | """Report information extraction.""" | |
1220 | self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) | |
1221 | ||
1222 | def _test_redirect(self, url): | |
1223 | """Check if it is a redirect, like url shorteners, in case return the new url.""" | |
1224 | class HeadRequest(compat_urllib_request.Request): | |
1225 | def get_method(self): | |
1226 | return "HEAD" | |
1227 | ||
1228 | class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): | |
1229 | """ | |
1230 | Subclass the HTTPRedirectHandler to make it use our | |
1231 | HeadRequest also on the redirected URL | |
1232 | """ | |
1233 | def redirect_request(self, req, fp, code, msg, headers, newurl): | |
1234 | if code in (301, 302, 303, 307): | |
1235 | newurl = newurl.replace(' ', '%20') | |
1236 | newheaders = dict((k,v) for k,v in req.headers.items() | |
1237 | if k.lower() not in ("content-length", "content-type")) | |
1238 | return HeadRequest(newurl, | |
1239 | headers=newheaders, | |
1240 | origin_req_host=req.get_origin_req_host(), | |
1241 | unverifiable=True) | |
1242 | else: | |
1243 | raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) | |
1244 | ||
1245 | class HTTPMethodFallback(compat_urllib_request.BaseHandler): | |
1246 | """ | |
1247 | Fallback to GET if HEAD is not allowed (405 HTTP error) | |
1248 | """ | |
1249 | def http_error_405(self, req, fp, code, msg, headers): | |
1250 | fp.read() | |
1251 | fp.close() | |
1252 | ||
1253 | newheaders = dict((k,v) for k,v in req.headers.items() | |
1254 | if k.lower() not in ("content-length", "content-type")) | |
1255 | return self.parent.open(compat_urllib_request.Request(req.get_full_url(), | |
1256 | headers=newheaders, | |
1257 | origin_req_host=req.get_origin_req_host(), | |
1258 | unverifiable=True)) | |
1259 | ||
1260 | # Build our opener | |
1261 | opener = compat_urllib_request.OpenerDirector() | |
1262 | for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, | |
1263 | HTTPMethodFallback, HEADRedirectHandler, | |
1264 | compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: | |
1265 | opener.add_handler(handler()) | |
1266 | ||
1267 | response = opener.open(HeadRequest(url)) | |
1268 | new_url = response.geturl() | |
1269 | ||
1270 | if url == new_url: | |
1271 | return False | |
1272 | ||
1273 | self.report_following_redirect(new_url) | |
1274 | return new_url | |
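
    # e.g. _test_redirect(u'http://bit.ly/xxxx') would return the expanded
    # target URL, or False when the page is served without redirecting
    # (illustrative URL).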
1275 | ||
1276 | def _real_extract(self, url): | |
1277 | new_url = self._test_redirect(url) | |
1278 | if new_url: return [self.url_result(new_url)] | |
1279 | ||
1280 | video_id = url.split('/')[-1] | |
1281 | try: | |
1282 | webpage = self._download_webpage(url, video_id) | |
1283 | except ValueError as err: | |
1284 | # since this is the last-resort InfoExtractor, if | |
1285 | # this error is thrown, it'll be thrown here | |
1286 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1287 | ||
1288 | self.report_extraction(video_id) | |
1289 | # Start with something easy: JW Player in SWFObject | |
1290 | mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) | |
1291 | if mobj is None: | |
1292 | # Broaden the search a little bit | |
1293 | mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) | |
1294 | if mobj is None: | |
1295 | # Broaden the search a little bit: JWPlayer JS loader | |
1296 | mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) | |
1297 | if mobj is None: | |
1298 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1299 | ||
1300 | # It's possible that one of the regexes | |
1301 | # matched, but returned an empty group: | |
1302 | if mobj.group(1) is None: | |
1303 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1304 | ||
1305 | video_url = compat_urllib_parse.unquote(mobj.group(1)) | |
1306 | video_id = os.path.basename(video_url) | |
1307 | ||
        # split the extension off the basename, then strip it from the video id:
1309 | video_extension = os.path.splitext(video_id)[1][1:] | |
1310 | video_id = os.path.splitext(video_id)[0] | |
1311 | ||
1312 | # it's tempting to parse this further, but you would | |
1313 | # have to take into account all the variations like | |
1314 | # Video Title - Site Name | |
1315 | # Site Name | Video Title | |
1316 | # Video Title - Tagline | Site Name | |
1317 | # and so on and so forth; it's just not practical | |
1318 | mobj = re.search(r'<title>(.*)</title>', webpage) | |
1319 | if mobj is None: | |
1320 | raise ExtractorError(u'Unable to extract title') | |
1321 | video_title = mobj.group(1) | |
1322 | ||
1323 | # video uploader is domain name | |
1324 | mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) | |
1325 | if mobj is None: | |
            raise ExtractorError(u'Unable to extract uploader')
1327 | video_uploader = mobj.group(1) | |
1328 | ||
1329 | return [{ | |
1330 | 'id': video_id, | |
1331 | 'url': video_url, | |
1332 | 'uploader': video_uploader, | |
1333 | 'upload_date': None, | |
1334 | 'title': video_title, | |
1335 | 'ext': video_extension, | |
1336 | }] | |
1337 | ||
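# Illustration only (not used by any extractor): a condensed sketch of the
# HEAD-probe idea behind GenericIE._test_redirect above. Unlike the custom
# handler chain there, plain urlopen re-issues redirected requests as GET;
# the _demo_* name is an editorial addition, not part of this module's API.
def _demo_resolve_redirect(url):
    class _Head(compat_urllib_request.Request):
        def get_method(self):
            return 'HEAD'
    try:
        response = compat_urllib_request.urlopen(_Head(url))
    except compat_urllib_error.HTTPError as err:
        if err.code != 405:
            raise
        # server rejects HEAD: retry with a plain GET, as HTTPMethodFallback does
        response = compat_urllib_request.urlopen(url)
    new_url = response.geturl()
    return new_url if new_url != url else None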
1338 | ||
1339 | class YoutubeSearchIE(InfoExtractor): | |
1340 | """Information Extractor for YouTube search queries.""" | |
1341 | _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' | |
1342 | _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' | |
1343 | _max_youtube_results = 1000 | |
1344 | IE_NAME = u'youtube:search' | |
1345 | ||
1346 | def report_download_page(self, query, pagenum): | |
1347 | """Report attempt to download search page with given number.""" | |
1348 | query = query.decode(preferredencoding()) | |
1349 | self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) | |
1350 | ||
1351 | def _real_extract(self, query): | |
1352 | mobj = re.match(self._VALID_URL, query) | |
1353 | if mobj is None: | |
1354 | raise ExtractorError(u'Invalid search query "%s"' % query) | |
1355 | ||
1356 | prefix, query = query.split(':') | |
1357 | prefix = prefix[8:] | |
1358 | query = query.encode('utf-8') | |
1359 | if prefix == '': | |
1360 | return self._get_n_results(query, 1) | |
1361 | elif prefix == 'all': | |
            return self._get_n_results(query, self._max_youtube_results)
1363 | else: | |
1364 | try: | |
1365 | n = int(prefix) | |
1366 | if n <= 0: | |
1367 | raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query)) | |
1368 | elif n > self._max_youtube_results: | |
1369 | self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) | |
1370 | n = self._max_youtube_results | |
1371 | return self._get_n_results(query, n) | |
1372 | except ValueError: # parsing prefix as integer fails | |
1373 | return self._get_n_results(query, 1) | |
1374 | ||
1375 | def _get_n_results(self, query, n): | |
1376 | """Get a specified number of results for a query""" | |
1377 | ||
1378 | video_ids = [] | |
1379 | pagenum = 0 | |
1380 | limit = n | |
1381 | ||
1382 | while (50 * pagenum) < limit: | |
1383 | self.report_download_page(query, pagenum+1) | |
1384 | result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) | |
1385 | request = compat_urllib_request.Request(result_url) | |
1386 | try: | |
1387 | data = compat_urllib_request.urlopen(request).read().decode('utf-8') | |
1388 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
1389 | raise ExtractorError(u'Unable to download API page: %s' % compat_str(err)) | |
1390 | api_response = json.loads(data)['data'] | |
1391 | ||
            if 'items' not in api_response:
1393 | raise ExtractorError(u'[youtube] No video results') | |
1394 | ||
1395 | new_ids = list(video['id'] for video in api_response['items']) | |
1396 | video_ids += new_ids | |
1397 | ||
1398 | limit = min(n, api_response['totalItems']) | |
1399 | pagenum += 1 | |
1400 | ||
1401 | if len(video_ids) > n: | |
1402 | video_ids = video_ids[:n] | |
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube') for video_id in video_ids]
1404 | return videos | |
1405 | ||
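# Editorial sketch (not called anywhere): the start-index arithmetic that
# _get_n_results above relies on -- the GData API is 1-indexed and capped at
# 50 results per request, so fetching n results takes ceil(n / 50) pages.
def _demo_gdata_page_urls(query, n, page_size=50):
    api = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=%i&v=2&alt=jsonc'
    pages = (n + page_size - 1) // page_size  # ceil without floats
    return [api % (compat_urllib_parse.quote_plus(query), page * page_size + 1, page_size)
            for page in range(pages)]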
1406 | ||
1407 | class GoogleSearchIE(InfoExtractor): | |
1408 | """Information Extractor for Google Video search queries.""" | |
1409 | _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)' | |
1410 | _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"' | |
1411 | _max_google_results = 1000 | |
1412 | IE_NAME = u'video.google:search' | |
1413 | ||
1414 | def _real_extract(self, query): | |
1415 | mobj = re.match(self._VALID_URL, query) | |
1416 | ||
1417 | prefix = mobj.group('prefix') | |
1418 | query = mobj.group('query') | |
1419 | if prefix == '': | |
1420 | return self._get_n_results(query, 1) | |
1421 | elif prefix == 'all': | |
1422 | return self._get_n_results(query, self._max_google_results) | |
1423 | else: | |
1424 | n = int(prefix) | |
1425 | if n <= 0: | |
1426 | raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) | |
1427 | elif n > self._max_google_results: | |
1428 | self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) | |
1429 | n = self._max_google_results | |
1430 | return self._get_n_results(query, n) | |
1431 | ||
1432 | def _get_n_results(self, query, n): | |
1433 | """Get a specified number of results for a query""" | |
1434 | ||
1435 | res = { | |
1436 | '_type': 'playlist', | |
1437 | 'id': query, | |
1438 | 'entries': [] | |
1439 | } | |
1440 | ||
1441 | for pagenum in itertools.count(1): | |
1442 | result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) | |
1444 | webpage = self._download_webpage(result_url, u'gvsearch:' + query, | |
1445 | note='Downloading result page ' + str(pagenum)) | |
1446 | ||
1447 | for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage): | |
1448 | e = { | |
1449 | '_type': 'url', | |
1450 | 'url': mobj.group(1) | |
1451 | } | |
1452 | res['entries'].append(e) | |
1453 | ||
1454 | if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage): | |
1455 | return res | |
1456 | ||
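# Editorial sketch: the bare result shape the search IEs in this file build --
# a '_type': 'playlist' dict whose entries are '_type': 'url' references that
# are later handed back to the matching extractor.
def _demo_search_playlist(query, urls):
    return {
        '_type': 'playlist',
        'id': query,
        'entries': [{'_type': 'url', 'url': u} for u in urls],
    }
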
1457 | class YahooSearchIE(InfoExtractor): | |
1458 | """Information Extractor for Yahoo! Video search queries.""" | |
1459 | ||
1460 | _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' | |
1461 | ||
1462 | _max_yahoo_results = 1000 | |
1463 | IE_NAME = u'screen.yahoo:search' | |
1464 | ||
1465 | def _real_extract(self, query): | |
1466 | mobj = re.match(self._VALID_URL, query) | |
1467 | if mobj is None: | |
1468 | raise ExtractorError(u'Invalid search query "%s"' % query) | |
1469 | ||
1470 | prefix, query = query.split(':') | |
1471 | prefix = prefix[8:] | |
1472 | query = query.encode('utf-8') | |
1473 | if prefix == '': | |
1474 | return self._get_n_results(query, 1) | |
1475 | elif prefix == 'all': | |
1476 | return self._get_n_results(query, self._max_yahoo_results) | |
1477 | else: | |
1478 | try: | |
1479 | n = int(prefix) | |
1480 | if n <= 0: | |
1481 | raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query)) | |
1482 | elif n > self._max_yahoo_results: | |
1483 | self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) | |
1484 | n = self._max_yahoo_results | |
1485 | return self._get_n_results(query, n) | |
1486 | except ValueError: # parsing prefix as integer fails | |
1487 | return self._get_n_results(query, 1) | |
1488 | ||
1489 | def _get_n_results(self, query, n): | |
1490 | """Get a specified number of results for a query""" | |
1491 | ||
1492 | res = { | |
1493 | '_type': 'playlist', | |
1494 | 'id': query, | |
1495 | 'entries': [] | |
1496 | } | |
1497 | for pagenum in itertools.count(0): | |
1498 | result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) | |
1499 | webpage = self._download_webpage(result_url, query, | |
1500 | note='Downloading results page '+str(pagenum+1)) | |
1501 | info = json.loads(webpage) | |
1502 | m = info[u'm'] | |
1503 | results = info[u'results'] | |
1504 | ||
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
                if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                    break
1513 | ||
1514 | return res | |
1515 | ||
1516 | ||
1517 | class YoutubePlaylistIE(InfoExtractor): | |
1518 | """Information Extractor for YouTube playlists.""" | |
1519 | ||
1520 | _VALID_URL = r"""(?: | |
1521 | (?:https?://)? | |
1522 | (?:\w+\.)? | |
1523 | youtube\.com/ | |
1524 | (?: | |
1525 | (?:course|view_play_list|my_playlists|artist|playlist|watch) | |
1526 | \? (?:.*?&)*? (?:p|a|list)= | |
1527 | | p/ | |
1528 | ) | |
1529 | ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) | |
1530 | .* | |
1531 | | | |
1532 | ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) | |
1533 | )""" | |
1534 | _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' | |
1535 | _MAX_RESULTS = 50 | |
1536 | IE_NAME = u'youtube:playlist' | |
1537 | ||
1538 | @classmethod | |
1539 | def suitable(cls, url): | |
1540 | """Receives a URL and returns True if suitable for this IE.""" | |
1541 | return re.match(cls._VALID_URL, url, re.VERBOSE) is not None | |
1542 | ||
1543 | def _real_extract(self, url): | |
1544 | # Extract playlist id | |
1545 | mobj = re.match(self._VALID_URL, url, re.VERBOSE) | |
1546 | if mobj is None: | |
1547 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1548 | ||
1549 | # Download playlist videos from API | |
1550 | playlist_id = mobj.group(1) or mobj.group(2) | |
1551 | page_num = 1 | |
1552 | videos = [] | |
1553 | ||
1554 | while True: | |
1555 | url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) | |
1556 | page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) | |
1557 | ||
1558 | try: | |
1559 | response = json.loads(page) | |
1560 | except ValueError as err: | |
1561 | raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) | |
1562 | ||
1563 | if 'feed' not in response: | |
1564 | raise ExtractorError(u'Got a malformed response from YouTube API') | |
1565 | playlist_title = response['feed']['title']['$t'] | |
1566 | if 'entry' not in response['feed']: | |
1567 | # Number of videos is a multiple of self._MAX_RESULTS | |
1568 | break | |
1569 | ||
1570 | videos += [ (entry['yt$position']['$t'], entry['content']['src']) | |
1571 | for entry in response['feed']['entry'] | |
1572 | if 'content' in entry ] | |
1573 | ||
1574 | if len(response['feed']['entry']) < self._MAX_RESULTS: | |
1575 | break | |
1576 | page_num += 1 | |
1577 | ||
1578 | videos = [v[1] for v in sorted(videos)] | |
1579 | ||
1580 | url_results = [self.url_result(url, 'Youtube') for url in videos] | |
1581 | return [self.playlist_result(url_results, playlist_id, playlist_title)] | |
1582 | ||
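# Editorial sketch of the ordering trick in YoutubePlaylistIE._real_extract
# above: (position, url) pairs sort by position first, so one sorted() call
# restores playlist order before the positions are discarded. The int() cast
# is added here for clarity; the extractor sorts the raw '$t' values.
def _demo_order_playlist_entries(entries):
    pairs = [(int(entry['yt$position']['$t']), entry['content']['src'])
             for entry in entries if 'content' in entry]
    return [url for _position, url in sorted(pairs)]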
1583 | ||
1584 | class YoutubeChannelIE(InfoExtractor): | |
1585 | """Information Extractor for YouTube channels.""" | |
1586 | ||
1587 | _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" | |
1588 | _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' | |
1589 | _MORE_PAGES_INDICATOR = 'yt-uix-load-more' | |
1590 | _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' | |
1591 | IE_NAME = u'youtube:channel' | |
1592 | ||
1593 | def extract_videos_from_page(self, page): | |
1594 | ids_in_page = [] | |
1595 | for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): | |
1596 | if mobj.group(1) not in ids_in_page: | |
1597 | ids_in_page.append(mobj.group(1)) | |
1598 | return ids_in_page | |
1599 | ||
1600 | def _real_extract(self, url): | |
1601 | # Extract channel id | |
1602 | mobj = re.match(self._VALID_URL, url) | |
1603 | if mobj is None: | |
1604 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1605 | ||
1606 | # Download channel page | |
1607 | channel_id = mobj.group(1) | |
1608 | video_ids = [] | |
1609 | pagenum = 1 | |
1610 | ||
1611 | url = self._TEMPLATE_URL % (channel_id, pagenum) | |
1612 | page = self._download_webpage(url, channel_id, | |
1613 | u'Downloading page #%s' % pagenum) | |
1614 | ||
1615 | # Extract video identifiers | |
1616 | ids_in_page = self.extract_videos_from_page(page) | |
1617 | video_ids.extend(ids_in_page) | |
1618 | ||
1619 | # Download any subsequent channel pages using the json-based channel_ajax query | |
1620 | if self._MORE_PAGES_INDICATOR in page: | |
1621 | while True: | |
1622 | pagenum = pagenum + 1 | |
1623 | ||
1624 | url = self._MORE_PAGES_URL % (pagenum, channel_id) | |
1625 | page = self._download_webpage(url, channel_id, | |
1626 | u'Downloading page #%s' % pagenum) | |
1627 | ||
1628 | page = json.loads(page) | |
1629 | ||
1630 | ids_in_page = self.extract_videos_from_page(page['content_html']) | |
1631 | video_ids.extend(ids_in_page) | |
1632 | ||
1633 | if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: | |
1634 | break | |
1635 | ||
1636 | self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) | |
1637 | ||
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1639 | url_entries = [self.url_result(url, 'Youtube') for url in urls] | |
1640 | return [self.playlist_result(url_entries, channel_id)] | |
1641 | ||
1642 | ||
1643 | class YoutubeUserIE(InfoExtractor): | |
1644 | """Information Extractor for YouTube users.""" | |
1645 | ||
1646 | _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' | |
1647 | _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' | |
1648 | _GDATA_PAGE_SIZE = 50 | |
1649 | _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' | |
1650 | _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' | |
1651 | IE_NAME = u'youtube:user' | |
1652 | ||
1653 | def _real_extract(self, url): | |
1654 | # Extract username | |
1655 | mobj = re.match(self._VALID_URL, url) | |
1656 | if mobj is None: | |
1657 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1658 | ||
1659 | username = mobj.group(1) | |
1660 | ||
1661 | # Download video ids using YouTube Data API. Result size per | |
1662 | # query is limited (currently to 50 videos) so we need to query | |
1663 | # page by page until there are no video ids - it means we got | |
1664 | # all of them. | |
1665 | ||
1666 | video_ids = [] | |
1667 | pagenum = 0 | |
1668 | ||
1669 | while True: | |
1670 | start_index = pagenum * self._GDATA_PAGE_SIZE + 1 | |
1671 | ||
1672 | gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) | |
1673 | page = self._download_webpage(gdata_url, username, | |
1674 | u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) | |
1675 | ||
1676 | # Extract video identifiers | |
1677 | ids_in_page = [] | |
1678 | ||
1679 | for mobj in re.finditer(self._VIDEO_INDICATOR, page): | |
1680 | if mobj.group(1) not in ids_in_page: | |
1681 | ids_in_page.append(mobj.group(1)) | |
1682 | ||
1683 | video_ids.extend(ids_in_page) | |
1684 | ||
            # A little optimization: a page that is not "full", i.e. holds
            # fewer than PAGE_SIZE video ids, must be the last one, so there
            # is no need to query any further pages.
1690 | ||
1691 | if len(ids_in_page) < self._GDATA_PAGE_SIZE: | |
1692 | break | |
1693 | ||
1694 | pagenum += 1 | |
1695 | ||
1696 | urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] | |
1697 | url_results = [self.url_result(url, 'Youtube') for url in urls] | |
1698 | return [self.playlist_result(url_results, playlist_title = username)] | |
1699 | ||
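# Editorial sketch (not called anywhere): the early-stop pagination idiom used
# by YoutubeUserIE above and BlipTVUserIE below -- a page holding fewer than
# page_size ids must be the last one, so no further request is needed.
def _demo_fetch_all_pages(fetch_page, page_size):
    items = []
    pagenum = 0
    while True:
        page_items = fetch_page(pagenum)
        items.extend(page_items)
        if len(page_items) < page_size:
            return items
        pagenum += 1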
1700 | ||
1701 | class BlipTVUserIE(InfoExtractor): | |
1702 | """Information Extractor for blip.tv users.""" | |
1703 | ||
1704 | _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' | |
1705 | _PAGE_SIZE = 12 | |
1706 | IE_NAME = u'blip.tv:user' | |
1707 | ||
1708 | def _real_extract(self, url): | |
1709 | # Extract username | |
1710 | mobj = re.match(self._VALID_URL, url) | |
1711 | if mobj is None: | |
1712 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1713 | ||
1714 | username = mobj.group(1) | |
1715 | ||
1716 | page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' | |
1717 | ||
1718 | page = self._download_webpage(url, username, u'Downloading user page') | |
1719 | mobj = re.search(r'data-users-id="([^"]+)"', page) | |
1720 | page_base = page_base % mobj.group(1) | |
1721 | ||
1722 | ||
1723 | # Download video ids using BlipTV Ajax calls. Result size per | |
1724 | # query is limited (currently to 12 videos) so we need to query | |
1725 | # page by page until there are no video ids - it means we got | |
1726 | # all of them. | |
1727 | ||
1728 | video_ids = [] | |
1729 | pagenum = 1 | |
1730 | ||
1731 | while True: | |
1732 | url = page_base + "&page=" + str(pagenum) | |
1733 | page = self._download_webpage(url, username, | |
1734 | u'Downloading video ids from page %d' % pagenum) | |
1735 | ||
1736 | # Extract video identifiers | |
1737 | ids_in_page = [] | |
1738 | ||
1739 | for mobj in re.finditer(r'href="/([^"]+)"', page): | |
1740 | if mobj.group(1) not in ids_in_page: | |
1741 | ids_in_page.append(unescapeHTML(mobj.group(1))) | |
1742 | ||
1743 | video_ids.extend(ids_in_page) | |
1744 | ||
            # A little optimization: a page that is not "full", i.e. holds
            # fewer than PAGE_SIZE video ids, must be the last one, so there
            # is no need to query any further pages.
1750 | ||
1751 | if len(ids_in_page) < self._PAGE_SIZE: | |
1752 | break | |
1753 | ||
1754 | pagenum += 1 | |
1755 | ||
1756 | urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] | |
1757 | url_entries = [self.url_result(url, 'BlipTV') for url in urls] | |
1758 | return [self.playlist_result(url_entries, playlist_title = username)] | |
1759 | ||
1760 | ||
1761 | class DepositFilesIE(InfoExtractor): | |
1762 | """Information extractor for depositfiles.com""" | |
1763 | ||
1764 | _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)' | |
1765 | ||
1766 | def _real_extract(self, url): | |
1767 | file_id = url.split('/')[-1] | |
1768 | # Rebuild url in english locale | |
1769 | url = 'http://depositfiles.com/en/files/' + file_id | |
1770 | ||
1771 | # Retrieve file webpage with 'Free download' button pressed | |
1772 | free_download_indication = { 'gateway_result' : '1' } | |
1773 | request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication)) | |
1774 | try: | |
1775 | self.report_download_webpage(file_id) | |
1776 | webpage = compat_urllib_request.urlopen(request).read() | |
1777 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
1778 | raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err)) | |
1779 | ||
1780 | # Search for the real file URL | |
1781 | mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage) | |
1782 | if (mobj is None) or (mobj.group(1) is None): | |
1783 | # Try to figure out reason of the error. | |
1784 | mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL) | |
1785 | if (mobj is not None) and (mobj.group(1) is not None): | |
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
1787 | raise ExtractorError(u'%s' % restriction_message) | |
1788 | else: | |
1789 | raise ExtractorError(u'Unable to extract download URL from: %s' % url) | |
1790 | ||
1791 | file_url = mobj.group(1) | |
1792 | file_extension = os.path.splitext(file_url)[1][1:] | |
1793 | ||
1794 | # Search for file title | |
1795 | mobj = re.search(r'<b title="(.*?)">', webpage) | |
1796 | if mobj is None: | |
1797 | raise ExtractorError(u'Unable to extract title') | |
1798 | file_title = mobj.group(1).decode('utf-8') | |
1799 | ||
1800 | return [{ | |
1801 | 'id': file_id.decode('utf-8'), | |
1802 | 'url': file_url.decode('utf-8'), | |
1803 | 'uploader': None, | |
1804 | 'upload_date': None, | |
1805 | 'title': file_title, | |
1806 | 'ext': file_extension.decode('utf-8'), | |
1807 | }] | |
1808 | ||
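# Editorial sketch: DepositFilesIE simulates pressing the 'Free download'
# button simply by POSTing gateway_result=1 to the file page -- passing a
# data argument turns the urllib request into a POST.
def _demo_press_free_download(url):
    data = compat_urllib_parse.urlencode({'gateway_result': '1'})
    return compat_urllib_request.Request(url, data)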
1809 | ||
1810 | class FacebookIE(InfoExtractor): | |
1811 | """Information Extractor for Facebook""" | |
1812 | ||
1813 | _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' | |
1814 | _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' | |
1815 | _NETRC_MACHINE = 'facebook' | |
1816 | IE_NAME = u'facebook' | |
1817 | ||
1818 | def report_login(self): | |
1819 | """Report attempt to log in.""" | |
1820 | self.to_screen(u'Logging in') | |
1821 | ||
1822 | def _real_initialize(self): | |
1823 | if self._downloader is None: | |
1824 | return | |
1825 | ||
1826 | useremail = None | |
1827 | password = None | |
1828 | downloader_params = self._downloader.params | |
1829 | ||
1830 | # Attempt to use provided username and password or .netrc data | |
1831 | if downloader_params.get('username', None) is not None: | |
1832 | useremail = downloader_params['username'] | |
1833 | password = downloader_params['password'] | |
1834 | elif downloader_params.get('usenetrc', False): | |
1835 | try: | |
1836 | info = netrc.netrc().authenticators(self._NETRC_MACHINE) | |
1837 | if info is not None: | |
1838 | useremail = info[0] | |
1839 | password = info[2] | |
1840 | else: | |
1841 | raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) | |
1842 | except (IOError, netrc.NetrcParseError) as err: | |
1843 | self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) | |
1844 | return | |
1845 | ||
1846 | if useremail is None: | |
1847 | return | |
1848 | ||
1849 | # Log in | |
1850 | login_form = { | |
1851 | 'email': useremail, | |
1852 | 'pass': password, | |
1853 | 'login': 'Log+In' | |
1854 | } | |
1855 | request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) | |
1856 | try: | |
1857 | self.report_login() | |
1858 | login_results = compat_urllib_request.urlopen(request).read() | |
1859 | if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: | |
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
1861 | return | |
1862 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
1863 | self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) | |
1864 | return | |
1865 | ||
1866 | def _real_extract(self, url): | |
1867 | mobj = re.match(self._VALID_URL, url) | |
1868 | if mobj is None: | |
1869 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1870 | video_id = mobj.group('ID') | |
1871 | ||
1872 | url = 'https://www.facebook.com/video/video.php?v=%s' % video_id | |
1873 | webpage = self._download_webpage(url, video_id) | |
1874 | ||
1875 | BEFORE = '{swf.addParam(param[0], param[1]);});\n' | |
1876 | AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' | |
1877 | m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) | |
1878 | if not m: | |
1879 | raise ExtractorError(u'Cannot parse data') | |
1880 | data = dict(json.loads(m.group(1))) | |
1881 | params_raw = compat_urllib_parse.unquote(data['params']) | |
1882 | params = json.loads(params_raw) | |
1883 | video_data = params['video_data'][0] | |
1884 | video_url = video_data.get('hd_src') | |
1885 | if not video_url: | |
1886 | video_url = video_data['sd_src'] | |
1887 | if not video_url: | |
1888 | raise ExtractorError(u'Cannot find video URL') | |
1889 | video_duration = int(video_data['video_duration']) | |
1890 | thumbnail = video_data['thumbnail_src'] | |
1891 | ||
1892 | m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage) | |
1893 | if not m: | |
1894 | raise ExtractorError(u'Cannot find title in webpage') | |
1895 | video_title = unescapeHTML(m.group(1)) | |
1896 | ||
1897 | info = { | |
1898 | 'id': video_id, | |
1899 | 'title': video_title, | |
1900 | 'url': video_url, | |
1901 | 'ext': 'mp4', | |
1902 | 'duration': video_duration, | |
1903 | 'thumbnail': thumbnail, | |
1904 | } | |
1905 | return [info] | |
1906 | ||
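# Editorial sketch of the .netrc lookup in FacebookIE._real_initialize above:
# authenticators() returns a (login, account, password) triple, or None when
# the machine has no entry.
def _demo_netrc_credentials(machine='facebook'):
    info = netrc.netrc().authenticators(machine)
    if info is None:
        raise netrc.NetrcParseError('No authenticators for %s' % machine)
    return info[0], info[2]  # (login, password)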
1907 | ||
1908 | class BlipTVIE(InfoExtractor): | |
1909 | """Information extractor for blip.tv""" | |
1910 | ||
1911 | _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' | |
1912 | _URL_EXT = r'^.*\.([a-z0-9]+)$' | |
1913 | IE_NAME = u'blip.tv' | |
1914 | ||
1915 | def report_direct_download(self, title): | |
1916 | """Report information extraction.""" | |
1917 | self.to_screen(u'%s: Direct download detected' % title) | |
1918 | ||
1919 | def _real_extract(self, url): | |
1920 | mobj = re.match(self._VALID_URL, url) | |
1921 | if mobj is None: | |
1922 | raise ExtractorError(u'Invalid URL: %s' % url) | |
1923 | ||
1924 | urlp = compat_urllib_parse_urlparse(url) | |
1925 | if urlp.path.startswith('/play/'): | |
1926 | request = compat_urllib_request.Request(url) | |
1927 | response = compat_urllib_request.urlopen(request) | |
1928 | redirecturl = response.geturl() | |
1929 | rurlp = compat_urllib_parse_urlparse(redirecturl) | |
1930 | file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] | |
1931 | url = 'http://blip.tv/a/a-' + file_id | |
1932 | return self._real_extract(url) | |
1933 | ||
1934 | ||
1935 | if '?' in url: | |
1936 | cchar = '&' | |
1937 | else: | |
1938 | cchar = '?' | |
1939 | json_url = url + cchar + 'skin=json&version=2&no_wrap=1' | |
1940 | request = compat_urllib_request.Request(json_url) | |
1941 | request.add_header('User-Agent', 'iTunes/10.6.1') | |
1942 | self.report_extraction(mobj.group(1)) | |
1943 | info = None | |
1944 | try: | |
1945 | urlh = compat_urllib_request.urlopen(request) | |
1946 | if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download | |
1947 | basename = url.split('/')[-1] | |
1948 | title,ext = os.path.splitext(basename) | |
1949 | title = title.decode('UTF-8') | |
1950 | ext = ext.replace('.', '') | |
1951 | self.report_direct_download(title) | |
1952 | info = { | |
1953 | 'id': title, | |
1954 | 'url': url, | |
1955 | 'uploader': None, | |
1956 | 'upload_date': None, | |
1957 | 'title': title, | |
1958 | 'ext': ext, | |
1959 | 'urlhandle': urlh | |
1960 | } | |
1961 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
            raise ExtractorError(u'Unable to download video info webpage: %s' % compat_str(err))
1963 | if info is None: # Regular URL | |
1964 | try: | |
1965 | json_code_bytes = urlh.read() | |
1966 | json_code = json_code_bytes.decode('utf-8') | |
1967 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
1968 | raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) | |
1969 | ||
1970 | try: | |
1971 | json_data = json.loads(json_code) | |
1972 | if 'Post' in json_data: | |
1973 | data = json_data['Post'] | |
1974 | else: | |
1975 | data = json_data | |
1976 | ||
1977 | upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') | |
1978 | video_url = data['media']['url'] | |
1979 | umobj = re.match(self._URL_EXT, video_url) | |
1980 | if umobj is None: | |
                    raise ValueError('Cannot determine filename extension')
1982 | ext = umobj.group(1) | |
1983 | ||
1984 | info = { | |
1985 | 'id': data['item_id'], | |
1986 | 'url': video_url, | |
1987 | 'uploader': data['display_name'], | |
1988 | 'upload_date': upload_date, | |
1989 | 'title': data['title'], | |
1990 | 'ext': ext, | |
1991 | 'format': data['media']['mimeType'], | |
1992 | 'thumbnail': data['thumbnailUrl'], | |
1993 | 'description': data['description'], | |
1994 | 'player_url': data['embedUrl'], | |
1995 | 'user_agent': 'iTunes/10.6.1', | |
1996 | } | |
1997 | except (ValueError,KeyError) as err: | |
1998 | raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) | |
1999 | ||
2000 | return [info] | |
2001 | ||
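# Editorial sketch: BlipTVIE switches its extraction to blip.tv's JSON skin by
# appending query parameters with '&' or '?' depending on whether the URL
# already carries a query string.
def _demo_json_api_url(url):
    cchar = '&' if '?' in url else '?'
    return url + cchar + 'skin=json&version=2&no_wrap=1'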
2002 | ||
2003 | class MyVideoIE(InfoExtractor): | |
2004 | """Information Extractor for myvideo.de.""" | |
2005 | ||
2006 | _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' | |
2007 | IE_NAME = u'myvideo' | |
2008 | ||
2009 | def _real_extract(self,url): | |
2010 | mobj = re.match(self._VALID_URL, url) | |
2011 | if mobj is None: | |
2012 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2013 | ||
2014 | video_id = mobj.group(1) | |
2015 | ||
2016 | # Get video webpage | |
2017 | webpage_url = 'http://www.myvideo.de/watch/%s' % video_id | |
2018 | webpage = self._download_webpage(webpage_url, video_id) | |
2019 | ||
2020 | self.report_extraction(video_id) | |
2021 | mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'', | |
2022 | webpage) | |
2023 | if mobj is None: | |
2024 | raise ExtractorError(u'Unable to extract media URL') | |
2025 | video_url = mobj.group(1) + ('/%s.flv' % video_id) | |
2026 | ||
2027 | mobj = re.search('<title>([^<]+)</title>', webpage) | |
2028 | if mobj is None: | |
2029 | raise ExtractorError(u'Unable to extract title') | |
2030 | ||
2031 | video_title = mobj.group(1) | |
2032 | ||
2033 | return [{ | |
2034 | 'id': video_id, | |
2035 | 'url': video_url, | |
2036 | 'uploader': None, | |
2037 | 'upload_date': None, | |
2038 | 'title': video_title, | |
2039 | 'ext': u'flv', | |
2040 | }] | |
2041 | ||
2042 | class ComedyCentralIE(InfoExtractor): | |
2043 | """Information extractor for The Daily Show and Colbert Report """ | |
2044 | ||
2045 | # urls can be abbreviations like :thedailyshow or :colbert | |
    # urls for episodes like: http://www.thedailyshow.com/full-episodes/<episode>
2047 | # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day | |
2048 | # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news | |
2049 | # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 | |
2050 | _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport) | |
2051 | |(https?://)?(www\.)? | |
2052 | (?P<showname>thedailyshow|colbertnation)\.com/ | |
2053 | (full-episodes/(?P<episode>.*)| | |
2054 | (?P<clip> | |
2055 | (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) | |
2056 | |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))))) | |
2057 | $""" | |
2058 | ||
2059 | _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] | |
2060 | ||
2061 | _video_extensions = { | |
2062 | '3500': 'mp4', | |
2063 | '2200': 'mp4', | |
2064 | '1700': 'mp4', | |
2065 | '1200': 'mp4', | |
2066 | '750': 'mp4', | |
2067 | '400': 'mp4', | |
2068 | } | |
2069 | _video_dimensions = { | |
2070 | '3500': '1280x720', | |
2071 | '2200': '960x540', | |
2072 | '1700': '768x432', | |
2073 | '1200': '640x360', | |
2074 | '750': '512x288', | |
2075 | '400': '384x216', | |
2076 | } | |
2077 | ||
2078 | @classmethod | |
2079 | def suitable(cls, url): | |
2080 | """Receives a URL and returns True if suitable for this IE.""" | |
2081 | return re.match(cls._VALID_URL, url, re.VERBOSE) is not None | |
2082 | ||
2083 | def _print_formats(self, formats): | |
2084 | print('Available formats:') | |
2085 | for x in formats: | |
2086 | print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???'))) | |
2087 | ||
2088 | ||
2089 | def _real_extract(self, url): | |
2090 | mobj = re.match(self._VALID_URL, url, re.VERBOSE) | |
2091 | if mobj is None: | |
2092 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2093 | ||
2094 | if mobj.group('shortname'): | |
2095 | if mobj.group('shortname') in ('tds', 'thedailyshow'): | |
2096 | url = u'http://www.thedailyshow.com/full-episodes/' | |
2097 | else: | |
2098 | url = u'http://www.colbertnation.com/full-episodes/' | |
2099 | mobj = re.match(self._VALID_URL, url, re.VERBOSE) | |
2100 | assert mobj is not None | |
2101 | ||
2102 | if mobj.group('clip'): | |
2103 | if mobj.group('showname') == 'thedailyshow': | |
2104 | epTitle = mobj.group('tdstitle') | |
2105 | else: | |
2106 | epTitle = mobj.group('cntitle') | |
2107 | dlNewest = False | |
2108 | else: | |
2109 | dlNewest = not mobj.group('episode') | |
2110 | if dlNewest: | |
2111 | epTitle = mobj.group('showname') | |
2112 | else: | |
2113 | epTitle = mobj.group('episode') | |
2114 | ||
2115 | self.report_extraction(epTitle) | |
2116 | webpage,htmlHandle = self._download_webpage_handle(url, epTitle) | |
2117 | if dlNewest: | |
2118 | url = htmlHandle.geturl() | |
2119 | mobj = re.match(self._VALID_URL, url, re.VERBOSE) | |
2120 | if mobj is None: | |
2121 | raise ExtractorError(u'Invalid redirected URL: ' + url) | |
2122 | if mobj.group('episode') == '': | |
2123 | raise ExtractorError(u'Redirected URL is still not specific: ' + url) | |
2124 | epTitle = mobj.group('episode') | |
2125 | ||
2126 | mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) | |
2127 | ||
2128 | if len(mMovieParams) == 0: | |
            # The Colbert Report embeds the media id in a data-mgid attribute
            # without a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
2132 | ||
2133 | altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) | |
2134 | if len(altMovieParams) == 0: | |
2135 | raise ExtractorError(u'unable to find Flash URL in webpage ' + url) | |
2136 | else: | |
2137 | mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])] | |
2138 | ||
2139 | uri = mMovieParams[0][1] | |
2140 | indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri}) | |
2141 | indexXml = self._download_webpage(indexUrl, epTitle, | |
2142 | u'Downloading show index', | |
2143 | u'unable to download episode index') | |
2144 | ||
2145 | results = [] | |
2146 | ||
2147 | idoc = xml.etree.ElementTree.fromstring(indexXml) | |
2148 | itemEls = idoc.findall('.//item') | |
2149 | for partNum,itemEl in enumerate(itemEls): | |
2150 | mediaId = itemEl.findall('./guid')[0].text | |
2151 | shortMediaId = mediaId.split(':')[-1] | |
2152 | showId = mediaId.split(':')[-2].replace('.com', '') | |
2153 | officialTitle = itemEl.findall('./title')[0].text | |
2154 | officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text) | |
2155 | ||
2156 | configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' + | |
2157 | compat_urllib_parse.urlencode({'uri': mediaId})) | |
2158 | configXml = self._download_webpage(configUrl, epTitle, | |
2159 | u'Downloading configuration for %s' % shortMediaId) | |
2160 | ||
2161 | cdoc = xml.etree.ElementTree.fromstring(configXml) | |
2162 | turls = [] | |
2163 | for rendition in cdoc.findall('.//rendition'): | |
2164 | finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) | |
2165 | turls.append(finfo) | |
2166 | ||
2167 | if len(turls) == 0: | |
2168 | self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found') | |
2169 | continue | |
2170 | ||
2171 | if self._downloader.params.get('listformats', None): | |
2172 | self._print_formats([i[0] for i in turls]) | |
2173 | return | |
2174 | ||
2175 | # For now, just pick the highest bitrate | |
2176 | format,rtmp_video_url = turls[-1] | |
2177 | ||
2178 | # Get the format arg from the arg stream | |
2179 | req_format = self._downloader.params.get('format', None) | |
2180 | ||
2181 | # Select format if we can find one | |
2182 | for f,v in turls: | |
2183 | if f == req_format: | |
2184 | format, rtmp_video_url = f, v | |
2185 | break | |
2186 | ||
2187 | m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url) | |
2188 | if not m: | |
2189 | raise ExtractorError(u'Cannot transform RTMP url') | |
2190 | base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' | |
2191 | video_url = base + m.group('finalid') | |
2192 | ||
2193 | effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) | |
2194 | info = { | |
2195 | 'id': shortMediaId, | |
2196 | 'url': video_url, | |
2197 | 'uploader': showId, | |
2198 | 'upload_date': officialDate, | |
2199 | 'title': effTitle, | |
2200 | 'ext': 'mp4', | |
2201 | 'format': format, | |
2202 | 'thumbnail': None, | |
2203 | 'description': officialTitle, | |
2204 | } | |
2205 | results.append(info) | |
2206 | ||
2207 | return results | |
2208 | ||
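# Editorial sketch of the URL rewrite at the end of ComedyCentralIE above: the
# RTMP rendition URL is mapped onto a known HTTP mirror by keeping everything
# from 'gsp.comedystor/' onwards and grafting it onto the mirror prefix.
def _demo_rtmp_to_http(rtmp_video_url):
    m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)
    if m is None:
        raise ValueError('Cannot transform RTMP url')
    base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
    return base + m.group('finalid')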
2209 | ||
2210 | class EscapistIE(InfoExtractor): | |
2211 | """Information extractor for The Escapist """ | |
2212 | ||
2213 | _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$' | |
2214 | IE_NAME = u'escapist' | |
2215 | ||
2216 | def _real_extract(self, url): | |
2217 | mobj = re.match(self._VALID_URL, url) | |
2218 | if mobj is None: | |
2219 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2220 | showName = mobj.group('showname') | |
2221 | videoId = mobj.group('episode') | |
2222 | ||
2223 | self.report_extraction(showName) | |
2224 | webPage = self._download_webpage(url, showName) | |
2225 | ||
2226 | descMatch = re.search('<meta name="description" content="([^"]*)"', webPage) | |
2227 | description = unescapeHTML(descMatch.group(1)) | |
2228 | imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage) | |
2229 | imgUrl = unescapeHTML(imgMatch.group(1)) | |
2230 | playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage) | |
2231 | playerUrl = unescapeHTML(playerUrlMatch.group(1)) | |
2232 | configUrlMatch = re.search('config=(.*)$', playerUrl) | |
2233 | configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1)) | |
2234 | ||
2235 | configJSON = self._download_webpage(configUrl, showName, | |
2236 | u'Downloading configuration', | |
2237 | u'unable to download configuration') | |
2238 | ||
2239 | # Technically, it's JavaScript, not JSON | |
2240 | configJSON = configJSON.replace("'", '"') | |
2241 | ||
2242 | try: | |
2243 | config = json.loads(configJSON) | |
2244 | except (ValueError,) as err: | |
2245 | raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err)) | |
2246 | ||
2247 | playlist = config['playlist'] | |
2248 | videoUrl = playlist[1]['url'] | |
2249 | ||
2250 | info = { | |
2251 | 'id': videoId, | |
2252 | 'url': videoUrl, | |
2253 | 'uploader': showName, | |
2254 | 'upload_date': None, | |
2255 | 'title': showName, | |
2256 | 'ext': 'mp4', | |
2257 | 'thumbnail': imgUrl, | |
2258 | 'description': description, | |
2259 | 'player_url': playerUrl, | |
2260 | } | |
2261 | ||
2262 | return [info] | |
2263 | ||
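# Editorial sketch: EscapistIE's quote-swap above is a heuristic, not a real
# JavaScript parser -- it breaks as soon as the payload itself contains a
# quote character, which is worth keeping in mind when debugging it.
def _demo_js_to_json(config_js):
    return json.loads(config_js.replace("'", '"'))
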
2264 | class CollegeHumorIE(InfoExtractor): | |
2265 | """Information extractor for collegehumor.com""" | |
2266 | ||
2267 | _WORKING = False | |
2268 | _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$' | |
2269 | IE_NAME = u'collegehumor' | |
2270 | ||
2271 | def report_manifest(self, video_id): | |
2272 | """Report information extraction.""" | |
2273 | self.to_screen(u'%s: Downloading XML manifest' % video_id) | |
2274 | ||
2275 | def _real_extract(self, url): | |
2276 | mobj = re.match(self._VALID_URL, url) | |
2277 | if mobj is None: | |
2278 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2279 | video_id = mobj.group('videoid') | |
2280 | ||
2281 | info = { | |
2282 | 'id': video_id, | |
2283 | 'uploader': None, | |
2284 | 'upload_date': None, | |
2285 | } | |
2286 | ||
2287 | self.report_extraction(video_id) | |
2288 | xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id | |
2289 | try: | |
2290 | metaXml = compat_urllib_request.urlopen(xmlUrl).read() | |
2291 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
2292 | raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) | |
2293 | ||
2294 | mdoc = xml.etree.ElementTree.fromstring(metaXml) | |
2295 | try: | |
2296 | videoNode = mdoc.findall('./video')[0] | |
2297 | info['description'] = videoNode.findall('./description')[0].text | |
2298 | info['title'] = videoNode.findall('./caption')[0].text | |
2299 | info['thumbnail'] = videoNode.findall('./thumbnail')[0].text | |
2300 | manifest_url = videoNode.findall('./file')[0].text | |
2301 | except IndexError: | |
2302 | raise ExtractorError(u'Invalid metadata XML file') | |
2303 | ||
2304 | manifest_url += '?hdcore=2.10.3' | |
2305 | self.report_manifest(video_id) | |
2306 | try: | |
2307 | manifestXml = compat_urllib_request.urlopen(manifest_url).read() | |
2308 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
2309 | raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) | |
2310 | ||
2311 | adoc = xml.etree.ElementTree.fromstring(manifestXml) | |
2312 | try: | |
2313 | media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] | |
2314 | node_id = media_node.attrib['url'] | |
2315 | video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text | |
2316 | except IndexError as err: | |
2317 | raise ExtractorError(u'Invalid manifest file') | |
2318 | ||
2319 | url_pr = compat_urllib_parse_urlparse(manifest_url) | |
2320 | url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' | |
2321 | ||
2322 | info['url'] = url | |
2323 | info['ext'] = 'f4f' | |
2324 | return [info] | |
2325 | ||
2326 | ||
2327 | class XVideosIE(InfoExtractor): | |
2328 | """Information extractor for xvideos.com""" | |
2329 | ||
2330 | _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' | |
2331 | IE_NAME = u'xvideos' | |
2332 | ||
2333 | def _real_extract(self, url): | |
2334 | mobj = re.match(self._VALID_URL, url) | |
2335 | if mobj is None: | |
2336 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2337 | video_id = mobj.group(1) | |
2338 | ||
2339 | webpage = self._download_webpage(url, video_id) | |
2340 | ||
2341 | self.report_extraction(video_id) | |
2342 | ||
2343 | ||
2344 | # Extract video URL | |
2345 | mobj = re.search(r'flv_url=(.+?)&', webpage) | |
2346 | if mobj is None: | |
2347 | raise ExtractorError(u'Unable to extract video url') | |
2348 | video_url = compat_urllib_parse.unquote(mobj.group(1)) | |
2349 | ||
2350 | ||
2351 | # Extract title | |
2352 | mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage) | |
2353 | if mobj is None: | |
2354 | raise ExtractorError(u'Unable to extract video title') | |
2355 | video_title = mobj.group(1) | |
2356 | ||
2357 | ||
2358 | # Extract video thumbnail | |
2359 | mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) | |
2360 | if mobj is None: | |
2361 | raise ExtractorError(u'Unable to extract video thumbnail') | |
2362 | video_thumbnail = mobj.group(0) | |
2363 | ||
2364 | info = { | |
2365 | 'id': video_id, | |
2366 | 'url': video_url, | |
2367 | 'uploader': None, | |
2368 | 'upload_date': None, | |
2369 | 'title': video_title, | |
2370 | 'ext': 'flv', | |
2371 | 'thumbnail': video_thumbnail, | |
2372 | 'description': None, | |
2373 | } | |
2374 | ||
2375 | return [info] | |
2376 | ||
2377 | ||
2378 | class SoundcloudIE(InfoExtractor): | |
2379 | """Information extractor for soundcloud.com | |
2380 | To access the media, the uid of the song and a stream token | |
2381 | must be extracted from the page source and the script must make | |
2382 | a request to media.soundcloud.com/crossdomain.xml. Then | |
2383 | the media can be grabbed by requesting from an url composed | |
2384 | of the stream token and uid | |
2385 | """ | |
2386 | ||
2387 | _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' | |
2388 | IE_NAME = u'soundcloud' | |
2389 | ||
2390 | def report_resolve(self, video_id): | |
2391 | """Report information extraction.""" | |
2392 | self.to_screen(u'%s: Resolving id' % video_id) | |
2393 | ||
2394 | def _real_extract(self, url): | |
2395 | mobj = re.match(self._VALID_URL, url) | |
2396 | if mobj is None: | |
2397 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2398 | ||
2399 | # extract uploader (which is in the url) | |
2400 | uploader = mobj.group(1) | |
2401 | # extract simple title (uploader + slug of song title) | |
2402 | slug_title = mobj.group(2) | |
2403 | simple_title = uploader + u'-' + slug_title | |
2404 | full_title = '%s/%s' % (uploader, slug_title) | |
2405 | ||
2406 | self.report_resolve(full_title) | |
2407 | ||
2408 | url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) | |
2409 | resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' | |
2410 | info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON') | |
2411 | ||
2412 | info = json.loads(info_json) | |
2413 | video_id = info['id'] | |
2414 | self.report_extraction(full_title) | |
2415 | ||
2416 | streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' | |
2417 | stream_json = self._download_webpage(streams_url, full_title, | |
2418 | u'Downloading stream definitions', | |
2419 | u'unable to download stream definitions') | |
2420 | ||
2421 | streams = json.loads(stream_json) | |
2422 | mediaURL = streams['http_mp3_128_url'] | |
2423 | upload_date = unified_strdate(info['created_at']) | |
2424 | ||
2425 | return [{ | |
2426 | 'id': info['id'], | |
2427 | 'url': mediaURL, | |
2428 | 'uploader': info['user']['username'], | |
2429 | 'upload_date': upload_date, | |
2430 | 'title': info['title'], | |
2431 | 'ext': u'mp3', | |
2432 | 'description': info['description'], | |
2433 | }] | |
2434 | ||
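# Editorial sketch of the two-step Soundcloud flow used above and below: the
# permalink resolves to a numeric track id, then the streams endpoint yields
# the direct mp3 URL (client_id copied verbatim from the extractors here).
def _demo_soundcloud_stream_url(uploader, slug_title, client_id='b45b1aa10f1ac2941910a7f0d10f8e28'):
    url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
    resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + client_id
    info = json.loads(compat_urllib_request.urlopen(resolv_url).read().decode('utf-8'))
    streams_url = 'https://api.sndcdn.com/i1/tracks/%s/streams?client_id=%s' % (info['id'], client_id)
    streams = json.loads(compat_urllib_request.urlopen(streams_url).read().decode('utf-8'))
    return streams['http_mp3_128_url']
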
2435 | class SoundcloudSetIE(InfoExtractor): | |
2436 | """Information extractor for soundcloud.com sets | |
2437 | To access the media, the uid of the song and a stream token | |
2438 | must be extracted from the page source and the script must make | |
2439 | a request to media.soundcloud.com/crossdomain.xml. Then | |
2440 | the media can be grabbed by requesting from an url composed | |
2441 | of the stream token and uid | |
2442 | """ | |
2443 | ||
2444 | _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)' | |
2445 | IE_NAME = u'soundcloud:set' | |
2446 | ||
2447 | def report_resolve(self, video_id): | |
2448 | """Report information extraction.""" | |
2449 | self.to_screen(u'%s: Resolving id' % video_id) | |
2450 | ||
2451 | def _real_extract(self, url): | |
2452 | mobj = re.match(self._VALID_URL, url) | |
2453 | if mobj is None: | |
2454 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2455 | ||
2456 | # extract uploader (which is in the url) | |
2457 | uploader = mobj.group(1) | |
2458 | # extract simple title (uploader + slug of song title) | |
2459 | slug_title = mobj.group(2) | |
2460 | simple_title = uploader + u'-' + slug_title | |
2461 | full_title = '%s/sets/%s' % (uploader, slug_title) | |
2462 | ||
2463 | self.report_resolve(full_title) | |
2464 | ||
2465 | url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) | |
2466 | resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' | |
2467 | info_json = self._download_webpage(resolv_url, full_title) | |
2468 | ||
2469 | videos = [] | |
2470 | info = json.loads(info_json) | |
2471 | if 'errors' in info: | |
2472 | for err in info['errors']: | |
2473 | self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message'])) | |
2474 | return | |
2475 | ||
2476 | self.report_extraction(full_title) | |
2477 | for track in info['tracks']: | |
2478 | video_id = track['id'] | |
2479 | ||
2480 | streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' | |
2481 | stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON') | |
2482 | ||
2483 | self.report_extraction(video_id) | |
2484 | streams = json.loads(stream_json) | |
2485 | mediaURL = streams['http_mp3_128_url'] | |
2486 | ||
2487 | videos.append({ | |
2488 | 'id': video_id, | |
2489 | 'url': mediaURL, | |
2490 | 'uploader': track['user']['username'], | |
2491 | 'upload_date': unified_strdate(track['created_at']), | |
2492 | 'title': track['title'], | |
2493 | 'ext': u'mp3', | |
2494 | 'description': track['description'], | |
2495 | }) | |
2496 | return videos | |
2497 | ||
2498 | ||
2499 | class InfoQIE(InfoExtractor): | |
2500 | """Information extractor for infoq.com""" | |
2501 | _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' | |
2502 | ||
2503 | def _real_extract(self, url): | |
2504 | mobj = re.match(self._VALID_URL, url) | |
2505 | if mobj is None: | |
2506 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2507 | ||
2508 | webpage = self._download_webpage(url, video_id=url) | |
2509 | self.report_extraction(url) | |
2510 | ||
2511 | # Extract video URL | |
2512 | mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage) | |
2513 | if mobj is None: | |
2514 | raise ExtractorError(u'Unable to extract video url') | |
2515 | real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) | |
2516 | video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id | |
2517 | ||
2518 | # Extract title | |
2519 | mobj = re.search(r'contentTitle = "(.*?)";', webpage) | |
2520 | if mobj is None: | |
2521 | raise ExtractorError(u'Unable to extract video title') | |
2522 | video_title = mobj.group(1) | |
2523 | ||
2524 | # Extract description | |
2525 | video_description = u'No description available.' | |
2526 | mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) | |
2527 | if mobj is not None: | |
2528 | video_description = mobj.group(1) | |
2529 | ||
2530 | video_filename = video_url.split('/')[-1] | |
        video_id, extension = video_filename.rsplit('.', 1)
2532 | ||
2533 | info = { | |
2534 | 'id': video_id, | |
2535 | 'url': video_url, | |
2536 | 'uploader': None, | |
2537 | 'upload_date': None, | |
2538 | 'title': video_title, | |
            'ext': extension, # the URL suffix says mp4, but the actual stream seems to be flv
2540 | 'thumbnail': None, | |
2541 | 'description': video_description, | |
2542 | } | |
2543 | ||
2544 | return [info] | |
2545 | ||
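# Editorial sketch of the id recovery in InfoQIE above: the scraped
# 'jsclassref' value is base64-encoded and percent-encoded, and decoding it
# yields the path component of the RTMP URL.
def _demo_infoq_video_url(jsclassref):
    real_id = compat_urllib_parse.unquote(
        base64.b64decode(jsclassref.encode('ascii')).decode('utf-8'))
    return 'rtmpe://video.infoq.com/cfx/st/' + real_id
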
2546 | class MixcloudIE(InfoExtractor): | |
2547 | """Information extractor for www.mixcloud.com""" | |
2548 | ||
    _WORKING = False # disabled until ported to the new API, which looks workable: http://www.mixcloud.com/developers/documentation/
2550 | _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' | |
2551 | IE_NAME = u'mixcloud' | |
2552 | ||
2553 | def report_download_json(self, file_id): | |
2554 | """Report JSON download.""" | |
2555 | self.to_screen(u'Downloading json') | |
2556 | ||
2557 | def get_urls(self, jsonData, fmt, bitrate='best'): | |
2558 | """Get urls from 'audio_formats' section in json""" | |
2559 | file_url = None | |
2560 | try: | |
2561 | bitrate_list = jsonData[fmt] | |
2562 | if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: | |
2563 | bitrate = max(bitrate_list) # select highest | |
2564 | ||
2565 | url_list = jsonData[fmt][bitrate] | |
2566 | except TypeError: # we have no bitrate info. | |
2567 | url_list = jsonData[fmt] | |
2568 | return url_list | |
2569 | ||
    def check_urls(self, url_list):
        """Return the first URL from the list that responds, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None
2580 | ||
2581 | def _print_formats(self, formats): | |
2582 | print('Available formats:') | |
2583 | for fmt in formats.keys(): | |
2584 | for b in formats[fmt]: | |
2585 | try: | |
2586 | ext = formats[fmt][b][0] | |
2587 | print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) | |
2588 | except TypeError: # we have no bitrate info | |
2589 | ext = formats[fmt][0] | |
2590 | print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) | |
2591 | break | |
2592 | ||
2593 | def _real_extract(self, url): | |
2594 | mobj = re.match(self._VALID_URL, url) | |
2595 | if mobj is None: | |
2596 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2597 | # extract uploader & filename from url | |
2598 | uploader = mobj.group(1).decode('utf-8') | |
2599 | file_id = uploader + "-" + mobj.group(2).decode('utf-8') | |
2600 | ||
2601 | # construct API request | |
2602 | file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' | |
2603 | # retrieve .json file with links to files | |
2604 | request = compat_urllib_request.Request(file_url) | |
2605 | try: | |
2606 | self.report_download_json(file_url) | |
2607 | jsonData = compat_urllib_request.urlopen(request).read() | |
2608 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
2609 | raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) | |
2610 | ||
2611 | # parse JSON | |
2612 | json_data = json.loads(jsonData) | |
2613 | player_url = json_data['player_swf_url'] | |
2614 | formats = dict(json_data['audio_formats']) | |
2615 | ||
2616 | req_format = self._downloader.params.get('format', None) | |
2617 | bitrate = None | |
2618 | ||
2619 | if self._downloader.params.get('listformats', None): | |
2620 | self._print_formats(formats) | |
2621 | return | |
2622 | ||
2623 | if req_format is None or req_format == 'best': | |
2624 | for format_param in formats.keys(): | |
2625 | url_list = self.get_urls(formats, format_param) | |
2626 | # check urls | |
2627 | file_url = self.check_urls(url_list) | |
2628 | if file_url is not None: | |
2629 | break # got it! | |
2630 | else: | |
2631 | if req_format not in formats: | |
2632 | raise ExtractorError(u'Format is not available') | |
2633 | ||
2634 | url_list = self.get_urls(formats, req_format) | |
2635 | file_url = self.check_urls(url_list) | |
2636 | format_param = req_format | |
2637 | ||
2638 | return [{ | |
2639 | 'id': file_id.decode('utf-8'), | |
2640 | 'url': file_url.decode('utf-8'), | |
2641 | 'uploader': uploader.decode('utf-8'), | |
2642 | 'upload_date': None, | |
2643 | 'title': json_data['name'], | |
2644 | 'ext': file_url.split('.')[-1].decode('utf-8'), | |
2645 | 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), | |
2646 | 'thumbnail': json_data['thumbnail_url'], | |
2647 | 'description': json_data['description'], | |
2648 | 'player_url': player_url.decode('utf-8'), | |
2649 | }] | |
2650 | ||
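# Editorial sketch of the bitrate selection inside MixcloudIE.get_urls above:
# anything other than an available explicit bitrate falls back to the highest
# one offered.
def _demo_pick_bitrate(bitrate_list, requested='best'):
    if requested is None or requested == 'best' or requested not in bitrate_list:
        return max(bitrate_list)
    return requested
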
2651 | class StanfordOpenClassroomIE(InfoExtractor): | |
2652 | """Information extractor for Stanford's Open ClassRoom""" | |
2653 | ||
2654 | _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' | |
2655 | IE_NAME = u'stanfordoc' | |
2656 | ||
2657 | def _real_extract(self, url): | |
2658 | mobj = re.match(self._VALID_URL, url) | |
2659 | if mobj is None: | |
2660 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2661 | ||
2662 | if mobj.group('course') and mobj.group('video'): # A specific video | |
2663 | course = mobj.group('course') | |
2664 | video = mobj.group('video') | |
2665 | info = { | |
2666 | 'id': course + '_' + video, | |
2667 | 'uploader': None, | |
2668 | 'upload_date': None, | |
2669 | } | |
2670 | ||
2671 | self.report_extraction(info['id']) | |
2672 | baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' | |
2673 | xmlUrl = baseUrl + video + '.xml' | |
2674 | try: | |
2675 | metaXml = compat_urllib_request.urlopen(xmlUrl).read() | |
2676 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
2677 | raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) | |
2678 | mdoc = xml.etree.ElementTree.fromstring(metaXml) | |
2679 | try: | |
2680 | info['title'] = mdoc.findall('./title')[0].text | |
2681 | info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text | |
2682 | except IndexError: | |
2683 | raise ExtractorError(u'Invalid metadata XML file') | |
2684 | info['ext'] = info['url'].rpartition('.')[2] | |
2685 | return [info] | |
2686 | elif mobj.group('course'): # A course page | |
2687 | course = mobj.group('course') | |
2688 | info = { | |
2689 | 'id': course, | |
2690 | 'type': 'playlist', | |
2691 | 'uploader': None, | |
2692 | 'upload_date': None, | |
2693 | } | |
2694 | ||
2695 | coursepage = self._download_webpage(url, info['id'], | |
2696 | note='Downloading course info page', | |
2697 | errnote='Unable to download course info page') | |
2698 | ||
2699 | m = re.search('<h1>([^<]+)</h1>', coursepage) | |
2700 | if m: | |
2701 | info['title'] = unescapeHTML(m.group(1)) | |
2702 | else: | |
2703 | info['title'] = info['id'] | |
2704 | ||
2705 | m = re.search('<description>([^<]+)</description>', coursepage) | |
2706 | if m: | |
2707 | info['description'] = unescapeHTML(m.group(1)) | |
2708 | ||
            links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage))
2710 | info['list'] = [ | |
2711 | { | |
2712 | 'type': 'reference', | |
2713 | 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), | |
2714 | } | |
2715 | for vpage in links] | |
2716 | results = [] | |
2717 | for entry in info['list']: | |
2718 | assert entry['type'] == 'reference' | |
2719 | results += self.extract(entry['url']) | |
2720 | return results | |
2721 | else: # Root page | |
2722 | info = { | |
2723 | 'id': 'Stanford OpenClassroom', | |
2724 | 'type': 'playlist', | |
2725 | 'uploader': None, | |
2726 | 'upload_date': None, | |
2727 | } | |
2728 | ||
2729 | self.report_download_webpage(info['id']) | |
2730 | rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' | |
2731 | try: | |
2732 | rootpage = compat_urllib_request.urlopen(rootURL).read() | |
2733 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
2734 | raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) | |
2735 | ||
2736 | info['title'] = info['id'] | |
2737 | ||
            links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage))
2739 | info['list'] = [ | |
2740 | { | |
2741 | 'type': 'reference', | |
2742 | 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), | |
2743 | } | |
2744 | for cpage in links] | |
2745 | ||
2746 | results = [] | |
2747 | for entry in info['list']: | |
2748 | assert entry['type'] == 'reference' | |
2749 | results += self.extract(entry['url']) | |
2750 | return results | |
2751 | ||
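# The course and root pages above return playlist dicts whose 'list' entries
# are {'type': 'reference', 'url': ...} items, each resolved by calling
# extract() again. A minimal sketch of that expansion pattern, assuming an
# already-initialized extractor instance `ie`:
def _example_expand_references(ie, playlist_info):
    results = []
    for entry in playlist_info['list']:
        assert entry['type'] == 'reference'
        results += ie.extract(entry['url'])
    return results
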
2752 | class MTVIE(InfoExtractor): | |
2753 | """Information extractor for MTV.com""" | |
2754 | ||
2755 | _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' | |
2756 | IE_NAME = u'mtv' | |
2757 | ||
2758 | def _real_extract(self, url): | |
2759 | mobj = re.match(self._VALID_URL, url) | |
2760 | if mobj is None: | |
2761 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2762 | if not mobj.group('proto'): | |
2763 | url = 'http://' + url | |
2764 | video_id = mobj.group('videoid') | |
2765 | ||
2766 | webpage = self._download_webpage(url, video_id) | |
2767 | ||
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        # the webpage is already decoded text, so the groups need no .decode()
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
2776 | video_title = performer + ' - ' + song_name | |
2777 | ||
2778 | mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) | |
2779 | if mobj is None: | |
            raise ExtractorError(u'Unable to extract mtvn_uri')
2781 | mtvn_uri = mobj.group(1) | |
2782 | ||
2783 | mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) | |
2784 | if mobj is None: | |
2785 | raise ExtractorError(u'Unable to extract content id') | |
2786 | content_id = mobj.group(1) | |
2787 | ||
2788 | videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri | |
2789 | self.report_extraction(video_id) | |
2790 | request = compat_urllib_request.Request(videogen_url) | |
2791 | try: | |
2792 | metadataXml = compat_urllib_request.urlopen(request).read() | |
2793 | except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | |
2794 | raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err)) | |
2795 | ||
2796 | mdoc = xml.etree.ElementTree.fromstring(metadataXml) | |
2797 | renditions = mdoc.findall('.//rendition') | |
2798 | ||
2799 | # For now, always pick the highest quality. | |
2800 | rendition = renditions[-1] | |
2801 | ||
2802 | try: | |
2803 | _,_,ext = rendition.attrib['type'].partition('/') | |
2804 | format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] | |
2805 | video_url = rendition.find('./src').text | |
2806 | except KeyError: | |
2807 | raise ExtractorError('Invalid rendition field.') | |
2808 | ||
2809 | info = { | |
2810 | 'id': video_id, | |
2811 | 'url': video_url, | |
2812 | 'uploader': performer, | |
2813 | 'upload_date': None, | |
2814 | 'title': video_title, | |
2815 | 'ext': ext, | |
2816 | 'format': format, | |
2817 | } | |
2818 | ||
2819 | return [info] | |
2820 | ||
2821 | ||
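# A small sketch of the rendition selection done in MTVIE above: parse the
# mediaGen XML and keep the last (highest quality) rendition. The XML below
# is illustrative sample data, not real MTV output.
def _example_pick_mtv_rendition(metadata_xml):
    mdoc = xml.etree.ElementTree.fromstring(metadata_xml)
    rendition = mdoc.findall('.//rendition')[-1]
    _, _, ext = rendition.attrib['type'].partition('/')
    return rendition.find('./src').text, ext

# _example_pick_mtv_rendition(
#     '<package><rendition type="video/mp4" width="640" height="360" bitrate="700">'
#     '<src>rtmpe://example.invalid/video.mp4</src></rendition></package>')
# -> ('rtmpe://example.invalid/video.mp4', 'mp4')
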
2822 | class YoukuIE(InfoExtractor): | |
2823 | _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html' | |
2824 | ||
2825 | def _gen_sid(self): | |
2826 | nowTime = int(time.time() * 1000) | |
2827 | random1 = random.randint(1000,1998) | |
2828 | random2 = random.randint(1000,9999) | |
2829 | ||
2830 | return "%d%d%d" %(nowTime,random1,random2) | |
2831 | ||
2832 | def _get_file_ID_mix_string(self, seed): | |
2833 | mixed = [] | |
2834 | source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890") | |
2835 | seed = float(seed) | |
2836 | for i in range(len(source)): | |
2837 | seed = (seed * 211 + 30031 ) % 65536 | |
2838 | index = math.floor(seed / 65536 * len(source) ) | |
2839 | mixed.append(source[int(index)]) | |
2840 | source.remove(source[int(index)]) | |
2841 | #return ''.join(mixed) | |
2842 | return mixed | |
2843 | ||
2844 | def _get_file_id(self, fileId, seed): | |
2845 | mixed = self._get_file_ID_mix_string(seed) | |
2846 | ids = fileId.split('*') | |
2847 | realId = [] | |
2848 | for ch in ids: | |
2849 | if ch: | |
2850 | realId.append(mixed[int(ch)]) | |
2851 | return ''.join(realId) | |
2852 | ||
2853 | def _real_extract(self, url): | |
2854 | mobj = re.match(self._VALID_URL, url) | |
2855 | if mobj is None: | |
2856 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2857 | video_id = mobj.group('ID') | |
2858 | ||
2859 | info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id | |
2860 | ||
2861 | jsondata = self._download_webpage(info_url, video_id) | |
2862 | ||
2863 | self.report_extraction(video_id) | |
2864 | try: | |
2865 | config = json.loads(jsondata) | |
2866 | ||
2867 | video_title = config['data'][0]['title'] | |
2868 | seed = config['data'][0]['seed'] | |
2869 | ||
2870 | format = self._downloader.params.get('format', None) | |
2871 | supported_format = list(config['data'][0]['streamfileids'].keys()) | |
2872 | ||
2873 | if format is None or format == 'best': | |
2874 | if 'hd2' in supported_format: | |
2875 | format = 'hd2' | |
2876 | else: | |
2877 | format = 'flv' | |
2878 | ext = u'flv' | |
2879 | elif format == 'worst': | |
2880 | format = 'mp4' | |
2881 | ext = u'mp4' | |
2882 | else: | |
2883 | format = 'flv' | |
2884 | ext = u'flv' | |
2885 | ||
2886 | ||
2887 | fileid = config['data'][0]['streamfileids'][format] | |
2888 | keys = [s['k'] for s in config['data'][0]['segs'][format]] | |
2889 | except (UnicodeDecodeError, ValueError, KeyError): | |
2890 | raise ExtractorError(u'Unable to extract info section') | |
2891 | ||
        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # characters 8-9 of fileid (i.e. fileid[8:10]) encode the segment
        # number; they are replaced for each segment below
2898 | for index, key in enumerate(keys): | |
2899 | ||
2900 | temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:]) | |
2901 | download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) | |
2902 | ||
2903 | info = { | |
2904 | 'id': '%s_part%02d' % (video_id, index), | |
2905 | 'url': download_url, | |
2906 | 'uploader': None, | |
2907 | 'upload_date': None, | |
2908 | 'title': video_title, | |
2909 | 'ext': ext, | |
2910 | } | |
2911 | files_info.append(info) | |
2912 | ||
2913 | return files_info | |
2914 | ||
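# A short demonstration of the fileid decoding implemented in YoukuIE above:
# the seed deterministically shuffles the source alphabet, and the
# '*'-separated indices then select characters from it. The fileId and seed
# values here are made up for illustration.
def _example_decode_youku_fileid(fileId='8*24*3*', seed=1234):
    ie = YoukuIE()
    return ie._get_file_id(fileId, seed)  # a 3-character string for this input
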
2915 | ||
2916 | class XNXXIE(InfoExtractor): | |
2917 | """Information extractor for xnxx.com""" | |
2918 | ||
2919 | _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' | |
2920 | IE_NAME = u'xnxx' | |
2921 | VIDEO_URL_RE = r'flv_url=(.*?)&' | |
2922 | VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM' | |
2923 | VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' | |
2924 | ||
2925 | def _real_extract(self, url): | |
2926 | mobj = re.match(self._VALID_URL, url) | |
2927 | if mobj is None: | |
2928 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2929 | video_id = mobj.group(1) | |
2930 | ||
2931 | # Get webpage content | |
2932 | webpage = self._download_webpage(url, video_id) | |
2933 | ||
2934 | result = re.search(self.VIDEO_URL_RE, webpage) | |
2935 | if result is None: | |
2936 | raise ExtractorError(u'Unable to extract video url') | |
2937 | video_url = compat_urllib_parse.unquote(result.group(1)) | |
2938 | ||
2939 | result = re.search(self.VIDEO_TITLE_RE, webpage) | |
2940 | if result is None: | |
2941 | raise ExtractorError(u'Unable to extract video title') | |
2942 | video_title = result.group(1) | |
2943 | ||
2944 | result = re.search(self.VIDEO_THUMB_RE, webpage) | |
2945 | if result is None: | |
2946 | raise ExtractorError(u'Unable to extract video thumbnail') | |
2947 | video_thumbnail = result.group(1) | |
2948 | ||
2949 | return [{ | |
2950 | 'id': video_id, | |
2951 | 'url': video_url, | |
2952 | 'uploader': None, | |
2953 | 'upload_date': None, | |
2954 | 'title': video_title, | |
2955 | 'ext': 'flv', | |
2956 | 'thumbnail': video_thumbnail, | |
2957 | 'description': None, | |
2958 | }] | |
2959 | ||
2960 | ||
2961 | class GooglePlusIE(InfoExtractor): | |
2962 | """Information extractor for plus.google.com.""" | |
2963 | ||
2964 | _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' | |
2965 | IE_NAME = u'plus.google' | |
2966 | ||
    def report_extract_entry(self, url):
        """Report downloading the post entry."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title."""
        self.to_screen(u'Title: %s' % video_title)
2982 | ||
2983 | def report_extract_vid_page(self, video_page): | |
2984 | """Report information extraction.""" | |
2985 | self.to_screen(u'Extracting video page: %s' % video_page) | |
2986 | ||
2987 | def _real_extract(self, url): | |
2988 | # Extract id from URL | |
2989 | mobj = re.match(self._VALID_URL, url) | |
2990 | if mobj is None: | |
2991 | raise ExtractorError(u'Invalid URL: %s' % url) | |
2992 | ||
2993 | post_url = mobj.group(0) | |
2994 | video_id = mobj.group(1) | |
2995 | ||
2996 | video_extension = 'flv' | |
2997 | ||
2998 | # Step 1, Retrieve post webpage to extract further information | |
2999 | self.report_extract_entry(post_url) | |
3000 | webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage') | |
3001 | ||
3002 | # Extract update date | |
3003 | upload_date = None | |
3004 | pattern = 'title="Timestamp">(.*?)</a>' | |
3005 | mobj = re.search(pattern, webpage) | |
3006 | if mobj: | |
3007 | upload_date = mobj.group(1) | |
3008 | # Convert timestring to a format suitable for filename | |
3009 | upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") | |
3010 | upload_date = upload_date.strftime('%Y%m%d') | |
3011 | self.report_date(upload_date) | |
3012 | ||
3013 | # Extract uploader | |
3014 | uploader = None | |
3015 | pattern = r'rel\="author".*?>(.*?)</a>' | |
3016 | mobj = re.search(pattern, webpage) | |
3017 | if mobj: | |
3018 | uploader = mobj.group(1) | |
3019 | self.report_uploader(uploader) | |
3020 | ||
3021 | # Extract title | |
3022 | # Get the first line for title | |
3023 | video_title = u'NA' | |
3024 | pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]' | |
3025 | mobj = re.search(pattern, webpage) | |
3026 | if mobj: | |
3027 | video_title = mobj.group(1) | |
3028 | self.report_title(video_title) | |
3029 | ||
3030 | # Step 2, Stimulate clicking the image box to launch video | |
3031 | pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' | |
3032 | mobj = re.search(pattern, webpage) | |
3033 | if mobj is None: | |
3034 | raise ExtractorError(u'Unable to extract video page URL') | |
3035 | ||
3036 | video_page = mobj.group(1) | |
3037 | webpage = self._download_webpage(video_page, video_id, u'Downloading video page') | |
3038 | self.report_extract_vid_page(video_page) | |
3039 | ||
3040 | ||
        # Extract video links of all sizes from the video page
3043 | pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' | |
3044 | mobj = re.findall(pattern, webpage) | |
3045 | if len(mobj) == 0: | |
3046 | raise ExtractorError(u'Unable to extract video links') | |
3047 | ||
3048 | # Sort in resolution | |
3049 | links = sorted(mobj) | |
3050 | ||
        # Choose the last entry of the sorted list, i.e. the highest resolution
        video_url = links[-1]
        # Keep only the url; the resolution part of the tuple is no longer needed
        video_url = video_url[-1]
3055 | # Treat escaped \u0026 style hex | |
3056 | try: | |
3057 | video_url = video_url.decode("unicode_escape") | |
3058 | except AttributeError: # Python 3 | |
3059 | video_url = bytes(video_url, 'ascii').decode('unicode-escape') | |
3060 | ||
3061 | ||
3062 | return [{ | |
3063 | 'id': video_id, | |
3064 | 'url': video_url, | |
3065 | 'uploader': uploader, | |
3066 | 'upload_date': upload_date, | |
3067 | 'title': video_title, | |
3068 | 'ext': video_extension, | |
3069 | }] | |
3070 | ||
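# The video URLs scraped by GooglePlusIE above contain literal escapes such
# as \u0026 in the page source. A minimal sketch of the Python 2/3-compatible
# unescaping used at the end of _real_extract:
def _example_unescape_video_url(video_url):
    try:
        return video_url.decode("unicode_escape")  # Python 2 str
    except AttributeError:  # Python 3 str has no .decode
        return bytes(video_url, 'ascii').decode('unicode-escape')

# _example_unescape_video_url('http://example.invalid/?a=1\\u0026b=2')
# -> 'http://example.invalid/?a=1&b=2'
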
3071 | class NBAIE(InfoExtractor): | |
3072 | _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' | |
3073 | IE_NAME = u'nba' | |
3074 | ||
3075 | def _real_extract(self, url): | |
3076 | mobj = re.match(self._VALID_URL, url) | |
3077 | if mobj is None: | |
3078 | raise ExtractorError(u'Invalid URL: %s' % url) | |
3079 | ||
3080 | video_id = mobj.group(1) | |
3081 | if video_id.endswith('/index.html'): | |
3082 | video_id = video_id[:-len('/index.html')] | |
3083 | ||
3084 | webpage = self._download_webpage(url, video_id) | |
3085 | ||
3086 | video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' | |
3087 | def _findProp(rexp, default=None): | |
3088 | m = re.search(rexp, webpage) | |
3089 | if m: | |
3090 | return unescapeHTML(m.group(1)) | |
3091 | else: | |
3092 | return default | |
3093 | ||
3094 | shortened_video_id = video_id.rpartition('/')[2] | |
3095 | title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '') | |
3096 | info = { | |
3097 | 'id': shortened_video_id, | |
3098 | 'url': video_url, | |
3099 | 'ext': 'mp4', | |
3100 | 'title': title, | |
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3102 | 'description': _findProp(r'<div class="description">(.*?)</h1>'), | |
3103 | } | |
3104 | return [info] | |
3105 | ||
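# The nested _findProp helper in NBAIE above is a generally useful pattern
# for optional page properties. The same idea as a standalone sketch:
def _example_find_prop(webpage, rexp, default=None):
    m = re.search(rexp, webpage)
    if m:
        return unescapeHTML(m.group(1))
    return default

# _example_find_prop('<meta property="og:title" content="Game Recap"/>',
#                    r'<meta property="og:title" content="(.*?)"')
# -> 'Game Recap'
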
3106 | class JustinTVIE(InfoExtractor): | |
3107 | """Information extractor for justin.tv and twitch.tv""" | |
3108 | # TODO: One broadcast may be split into multiple videos. The key | |
3109 | # 'broadcast_id' is the same for all parts, and 'broadcast_part' | |
3110 | # starts at 1 and increases. Can we treat all parts as one video? | |
3111 | ||
3112 | _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/ | |
3113 | (?: | |
3114 | (?P<channelid>[^/]+)| | |
3115 | (?:(?:[^/]+)/b/(?P<videoid>[^/]+))| | |
3116 | (?:(?:[^/]+)/c/(?P<chapterid>[^/]+)) | |
3117 | ) | |
3118 | /?(?:\#.*)?$ | |
3119 | """ | |
3120 | _JUSTIN_PAGE_LIMIT = 100 | |
3121 | IE_NAME = u'justin.tv' | |
3122 | ||
3123 | def report_download_page(self, channel, offset): | |
3124 | """Report attempt to download a single page of videos.""" | |
3125 | self.to_screen(u'%s: Downloading video information from %d to %d' % | |
3126 | (channel, offset, offset + self._JUSTIN_PAGE_LIMIT)) | |
3127 | ||
3128 | # Return count of items, list of *valid* items | |
3129 | def _parse_page(self, url, video_id): | |
3130 | webpage = self._download_webpage(url, video_id, | |
3131 | u'Downloading video info JSON', | |
3132 | u'unable to download video info JSON') | |
3133 | ||
3134 | response = json.loads(webpage) | |
        if not isinstance(response, list):
3136 | error_text = response.get('error', 'unknown error') | |
3137 | raise ExtractorError(u'Justin.tv API: %s' % error_text) | |
3138 | info = [] | |
3139 | for clip in response: | |
3140 | video_url = clip['video_file_url'] | |
3141 | if video_url: | |
3142 | video_extension = os.path.splitext(video_url)[1][1:] | |
3143 | video_date = re.sub('-', '', clip['start_time'][:10]) | |
3144 | video_uploader_id = clip.get('user_id', clip.get('channel_id')) | |
3145 | video_id = clip['id'] | |
3146 | video_title = clip.get('title', video_id) | |
3147 | info.append({ | |
3148 | 'id': video_id, | |
3149 | 'url': video_url, | |
3150 | 'title': video_title, | |
3151 | 'uploader': clip.get('channel_name', video_uploader_id), | |
3152 | 'uploader_id': video_uploader_id, | |
3153 | 'upload_date': video_date, | |
3154 | 'ext': video_extension, | |
3155 | }) | |
3156 | return (len(response), info) | |
3157 | ||
3158 | def _real_extract(self, url): | |
3159 | mobj = re.match(self._VALID_URL, url) | |
3160 | if mobj is None: | |
3161 | raise ExtractorError(u'invalid URL: %s' % url) | |
3162 | ||
3163 | api_base = 'http://api.justin.tv' | |
3164 | paged = False | |
3165 | if mobj.group('channelid'): | |
3166 | paged = True | |
3167 | video_id = mobj.group('channelid') | |
3168 | api = api_base + '/channel/archives/%s.json' % video_id | |
3169 | elif mobj.group('chapterid'): | |
3170 | chapter_id = mobj.group('chapterid') | |
3171 | ||
3172 | webpage = self._download_webpage(url, chapter_id) | |
3173 | m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage) | |
3174 | if not m: | |
3175 | raise ExtractorError(u'Cannot find archive of a chapter') | |
3176 | archive_id = m.group(1) | |
3177 | ||
3178 | api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id | |
3179 | chapter_info_xml = self._download_webpage(api, chapter_id, | |
3180 | note=u'Downloading chapter information', | |
3181 | errnote=u'Chapter information download failed') | |
3182 | doc = xml.etree.ElementTree.fromstring(chapter_info_xml) | |
3183 | for a in doc.findall('.//archive'): | |
3184 | if archive_id == a.find('./id').text: | |
3185 | break | |
3186 | else: | |
3187 | raise ExtractorError(u'Could not find chapter in chapter information') | |
3188 | ||
3189 | video_url = a.find('./video_file_url').text | |
3190 | video_ext = video_url.rpartition('.')[2] or u'flv' | |
3191 | ||
3192 | chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id | |
3193 | chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id, | |
3194 | note='Downloading chapter metadata', | |
3195 | errnote='Download of chapter metadata failed') | |
3196 | chapter_info = json.loads(chapter_info_json) | |
3197 | ||
3198 | bracket_start = int(doc.find('.//bracket_start').text) | |
3199 | bracket_end = int(doc.find('.//bracket_end').text) | |
3200 | ||
3201 | # TODO determine start (and probably fix up file) | |
3202 | # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457 | |
3203 | #video_url += u'?start=' + TODO:start_timestamp | |
3204 | # bracket_start is 13290, but we want 51670615 | |
3205 | self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. ' | |
3206 | u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end))) | |
3207 | ||
3208 | info = { | |
3209 | 'id': u'c' + chapter_id, | |
3210 | 'url': video_url, | |
3211 | 'ext': video_ext, | |
3212 | 'title': chapter_info['title'], | |
3213 | 'thumbnail': chapter_info['preview'], | |
3214 | 'description': chapter_info['description'], | |
3215 | 'uploader': chapter_info['channel']['display_name'], | |
3216 | 'uploader_id': chapter_info['channel']['name'], | |
3217 | } | |
3218 | return [info] | |
3219 | else: | |
3220 | video_id = mobj.group('videoid') | |
3221 | api = api_base + '/broadcast/by_archive/%s.json' % video_id | |
3222 | ||
3223 | self.report_extraction(video_id) | |
3224 | ||
3225 | info = [] | |
3226 | offset = 0 | |
3227 | limit = self._JUSTIN_PAGE_LIMIT | |
3228 | while True: | |
3229 | if paged: | |
3230 | self.report_download_page(video_id, offset) | |
3231 | page_url = api + ('?offset=%d&limit=%d' % (offset, limit)) | |
3232 | page_count, page_info = self._parse_page(page_url, video_id) | |
3233 | info.extend(page_info) | |
3234 | if not paged or page_count != limit: | |
3235 | break | |
3236 | offset += limit | |
3237 | return info | |
3238 | ||
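# A compact sketch of the offset/limit pagination loop used by JustinTVIE
# above: keep requesting pages until one comes back short. `fetch_page` is a
# stand-in for the API call and is assumed to return (count, items).
def _example_paged_fetch(fetch_page, limit=100):
    items = []
    offset = 0
    while True:
        page_count, page_items = fetch_page(offset, limit)
        items.extend(page_items)
        if page_count != limit:
            break
        offset += limit
    return items
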
3239 | class FunnyOrDieIE(InfoExtractor): | |
3240 | _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$' | |
3241 | ||
3242 | def _real_extract(self, url): | |
3243 | mobj = re.match(self._VALID_URL, url) | |
3244 | if mobj is None: | |
3245 | raise ExtractorError(u'invalid URL: %s' % url) | |
3246 | ||
3247 | video_id = mobj.group('id') | |
3248 | webpage = self._download_webpage(url, video_id) | |
3249 | ||
3250 | m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL) | |
3251 | if not m: | |
3252 | raise ExtractorError(u'Unable to find video information') | |
3253 | video_url = unescapeHTML(m.group('url')) | |
3254 | ||
3255 | m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) | |
3256 | if not m: | |
3257 | m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage) | |
3258 | if not m: | |
3259 | raise ExtractorError(u'Cannot find video title') | |
3260 | title = clean_html(m.group('title')) | |
3261 | ||
3262 | m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) | |
3263 | if m: | |
3264 | desc = unescapeHTML(m.group('desc')) | |
3265 | else: | |
3266 | desc = None | |
3267 | ||
3268 | info = { | |
3269 | 'id': video_id, | |
3270 | 'url': video_url, | |
3271 | 'ext': 'mp4', | |
3272 | 'title': title, | |
3273 | 'description': desc, | |
3274 | } | |
3275 | return [info] | |
3276 | ||
3277 | class SteamIE(InfoExtractor): | |
3278 | _VALID_URL = r"""http://store\.steampowered\.com/ | |
3279 | (agecheck/)? | |
3280 | (?P<urltype>video|app)/ #If the page is only for videos or for a game | |
3281 | (?P<gameID>\d+)/? | |
3282 | (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID | |
3283 | """ | |
3284 | ||
3285 | @classmethod | |
3286 | def suitable(cls, url): | |
3287 | """Receives a URL and returns True if suitable for this IE.""" | |
3288 | return re.match(cls._VALID_URL, url, re.VERBOSE) is not None | |
3289 | ||
3290 | def _real_extract(self, url): | |
3291 | m = re.match(self._VALID_URL, url, re.VERBOSE) | |
3292 | gameID = m.group('gameID') | |
3293 | videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID | |
3294 | self.report_age_confirmation() | |
3295 | webpage = self._download_webpage(videourl, gameID) | |
3296 | game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title') | |
3297 | ||
3298 | urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\}," | |
3299 | mweb = re.finditer(urlRE, webpage) | |
3300 | namesRE = r'<span class="title">(?P<videoName>.+?)</span>' | |
3301 | titles = re.finditer(namesRE, webpage) | |
3302 | thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">' | |
3303 | thumbs = re.finditer(thumbsRE, webpage) | |
3304 | videos = [] | |
3305 | for vid,vtitle,thumb in zip(mweb,titles,thumbs): | |
3306 | video_id = vid.group('videoID') | |
3307 | title = vtitle.group('videoName') | |
3308 | video_url = vid.group('videoURL') | |
3309 | video_thumb = thumb.group('thumbnail') | |
3310 | if not video_url: | |
3311 | raise ExtractorError(u'Cannot find video url for %s' % video_id) | |
3312 | info = { | |
3313 | 'id':video_id, | |
3314 | 'url':video_url, | |
3315 | 'ext': 'flv', | |
3316 | 'title': unescapeHTML(title), | |
3317 | 'thumbnail': video_thumb | |
3318 | } | |
3319 | videos.append(info) | |
3320 | return [self.playlist_result(videos, gameID, game_title)] | |
3321 | ||
3322 | class UstreamIE(InfoExtractor): | |
3323 | _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)' | |
3324 | IE_NAME = u'ustream' | |
3325 | ||
3326 | def _real_extract(self, url): | |
3327 | m = re.match(self._VALID_URL, url) | |
3328 | video_id = m.group('videoID') | |
3329 | video_url = u'http://tcdn.ustream.tv/video/%s' % video_id | |
3330 | webpage = self._download_webpage(url, video_id) | |
3331 | m = re.search(r'data-title="(?P<title>.+)"',webpage) | |
3332 | title = m.group('title') | |
3333 | m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage) | |
3334 | uploader = m.group('uploader') | |
3335 | info = { | |
3336 | 'id':video_id, | |
3337 | 'url':video_url, | |
3338 | 'ext': 'flv', | |
3339 | 'title': title, | |
3340 | 'uploader': uploader | |
3341 | } | |
3342 | return [info] | |
3343 | ||
3344 | class WorldStarHipHopIE(InfoExtractor): | |
3345 | _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)' | |
3346 | IE_NAME = u'WorldStarHipHop' | |
3347 | ||
3348 | def _real_extract(self, url): | |
3349 | _src_url = r'so\.addVariable\("file","(.*?)"\)' | |
3350 | ||
3351 | m = re.match(self._VALID_URL, url) | |
3352 | video_id = m.group('id') | |
3353 | ||
3354 | webpage_src = self._download_webpage(url, video_id) | |
3355 | ||
3356 | mobj = re.search(_src_url, webpage_src) | |
3357 | ||
3358 | if mobj is not None: | |
3359 | video_url = mobj.group(1) | |
3360 | if 'mp4' in video_url: | |
3361 | ext = 'mp4' | |
3362 | else: | |
3363 | ext = 'flv' | |
3364 | else: | |
3365 | raise ExtractorError(u'Cannot find video url for %s' % video_id) | |
3366 | ||
3367 | mobj = re.search(r"<title>(.*)</title>", webpage_src) | |
3368 | ||
3369 | if mobj is None: | |
3370 | raise ExtractorError(u'Cannot determine title') | |
3371 | title = mobj.group(1) | |
3372 | ||
3373 | mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src) | |
        # Get the thumbnail; if there is none, this is a WSHH candy video, so extract its correct title instead.
3375 | if mobj is not None: | |
3376 | thumbnail = mobj.group(1) | |
3377 | else: | |
3378 | _title = r"""candytitles.*>(.*)</span>""" | |
3379 | mobj = re.search(_title, webpage_src) | |
3380 | if mobj is not None: | |
3381 | title = mobj.group(1) | |
3382 | thumbnail = None | |
3383 | ||
3384 | results = [{ | |
3385 | 'id': video_id, | |
3386 | 'url' : video_url, | |
3387 | 'title' : title, | |
3388 | 'thumbnail' : thumbnail, | |
3389 | 'ext' : ext, | |
3390 | }] | |
3391 | return results | |
3392 | ||
3393 | class RBMARadioIE(InfoExtractor): | |
3394 | _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$' | |
3395 | ||
3396 | def _real_extract(self, url): | |
3397 | m = re.match(self._VALID_URL, url) | |
3398 | video_id = m.group('videoID') | |
3399 | ||
3400 | webpage = self._download_webpage(url, video_id) | |
3401 | m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage) | |
3402 | if not m: | |
3403 | raise ExtractorError(u'Cannot find metadata') | |
3404 | json_data = m.group(1) | |
3405 | ||
3406 | try: | |
3407 | data = json.loads(json_data) | |
3408 | except ValueError as e: | |
3409 | raise ExtractorError(u'Invalid JSON: ' + str(e)) | |
3410 | ||
3411 | video_url = data['akamai_url'] + '&cbr=256' | |
3412 | url_parts = compat_urllib_parse_urlparse(video_url) | |
3413 | video_ext = url_parts.path.rpartition('.')[2] | |
3414 | info = { | |
3415 | 'id': video_id, | |
3416 | 'url': video_url, | |
3417 | 'ext': video_ext, | |
3418 | 'title': data['title'], | |
3419 | 'description': data.get('teaser_text'), | |
3420 | 'location': data.get('country_of_origin'), | |
3421 | 'uploader': data.get('host', {}).get('name'), | |
3422 | 'uploader_id': data.get('host', {}).get('slug'), | |
3423 | 'thumbnail': data.get('image', {}).get('large_url_2x'), | |
3424 | 'duration': data.get('duration'), | |
3425 | } | |
3426 | return [info] | |
3427 | ||
3428 | ||
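# RBMARadioIE above pulls its metadata out of a JSON blob embedded in a
# <script> tag. A minimal sketch of that pattern, with a made-up page snippet:
def _example_json_from_script(webpage):
    m = re.search(r'gon\.show=(.+?);</script>', webpage)
    if not m:
        raise ExtractorError(u'Cannot find metadata')
    return json.loads(m.group(1))

# _example_json_from_script('<script>window.gon = {};gon.show={"title": "x"};</script>')
# -> {u'title': u'x'}
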
3429 | class YouPornIE(InfoExtractor): | |
3430 | """Information extractor for youporn.com.""" | |
3431 | _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)' | |
3432 | ||
3433 | def _print_formats(self, formats): | |
3434 | """Print all available formats""" | |
3435 | print(u'Available formats:') | |
3436 | print(u'ext\t\tformat') | |
3437 | print(u'---------------------------------') | |
3438 | for format in formats: | |
3439 | print(u'%s\t\t%s' % (format['ext'], format['format'])) | |
3440 | ||
    def _specific(self, req_format, formats):
        for x in formats:
            if x['format'] == req_format:
                return x
        return None
3446 | ||
3447 | def _real_extract(self, url): | |
3448 | mobj = re.match(self._VALID_URL, url) | |
3449 | if mobj is None: | |
3450 | raise ExtractorError(u'Invalid URL: %s' % url) | |
3451 | ||
3452 | video_id = mobj.group('videoid') | |
3453 | ||
3454 | req = compat_urllib_request.Request(url) | |
3455 | req.add_header('Cookie', 'age_verified=1') | |
3456 | webpage = self._download_webpage(req, video_id) | |
3457 | ||
3458 | # Get the video title | |
3459 | result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage) | |
3460 | if result is None: | |
3461 | raise ExtractorError(u'Unable to extract video title') | |
3462 | video_title = result.group('title').strip() | |
3463 | ||
3464 | # Get the video date | |
3465 | result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) | |
3466 | if result is None: | |
3467 | self._downloader.report_warning(u'unable to extract video date') | |
3468 | upload_date = None | |
3469 | else: | |
3470 | upload_date = unified_strdate(result.group('date').strip()) | |
3471 | ||
3472 | # Get the video uploader | |
3473 | result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) | |
3474 | if result is None: | |
3475 | self._downloader.report_warning(u'unable to extract uploader') | |
3476 | video_uploader = None | |
3477 | else: | |
3478 | video_uploader = result.group('uploader').strip() | |
3479 | video_uploader = clean_html( video_uploader ) | |
3480 | ||
3481 | # Get all of the formats available | |
3482 | DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' | |
3483 | result = re.search(DOWNLOAD_LIST_RE, webpage) | |
3484 | if result is None: | |
3485 | raise ExtractorError(u'Unable to extract download list') | |
3486 | download_list_html = result.group('download_list').strip() | |
3487 | ||
3488 | # Get all of the links from the page | |
3489 | LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' | |
3490 | links = re.findall(LINK_RE, download_list_html) | |
        if len(links) == 0:
            raise ExtractorError(u'No known formats available for video')
3493 | ||
3494 | self.to_screen(u'Links found: %d' % len(links)) | |
3495 | ||
3496 | formats = [] | |
3497 | for link in links: | |
3498 | ||
3499 | # A link looks like this: | |
3500 | # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 | |
3501 | # A path looks like this: | |
3502 | # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 | |
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
3510 | title = u'%s-%s-%s' % (video_title, size, bitrate) | |
3511 | ||
3512 | formats.append({ | |
3513 | 'id': video_id, | |
3514 | 'url': video_url, | |
3515 | 'uploader': video_uploader, | |
3516 | 'upload_date': upload_date, | |
3517 | 'title': title, | |
3518 | 'ext': extension, | |
3519 | 'format': format, | |
3520 | 'thumbnail': None, | |
3521 | 'description': None, | |
3522 | 'player_url': None | |
3523 | }) | |
3524 | ||
3525 | if self._downloader.params.get('listformats', None): | |
3526 | self._print_formats(formats) | |
3527 | return | |
3528 | ||
3529 | req_format = self._downloader.params.get('format', None) | |
3530 | self.to_screen(u'Format: %s' % req_format) | |
3531 | ||
3532 | if req_format is None or req_format == 'best': | |
3533 | return [formats[0]] | |
3534 | elif req_format == 'worst': | |
3535 | return [formats[-1]] | |
3536 | elif req_format in ('-1', 'all'): | |
3537 | return formats | |
        else:
            format = self._specific(req_format, formats)
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3543 | ||
3544 | ||
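# A sketch of the best/worst/all format selection convention implemented in
# YouPornIE above, for a `formats` list sorted from best to worst quality:
def _example_select_formats(formats, req_format):
    if req_format is None or req_format == 'best':
        return [formats[0]]
    elif req_format == 'worst':
        return [formats[-1]]
    elif req_format in ('-1', 'all'):
        return formats
    matches = [f for f in formats if f['format'] == req_format]
    if not matches:
        raise ExtractorError(u'Requested format not available')
    return matches[:1]
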
3545 | ||
3546 | class PornotubeIE(InfoExtractor): | |
3547 | """Information extractor for pornotube.com.""" | |
3548 | _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' | |
3549 | ||
3550 | def _real_extract(self, url): | |
3551 | mobj = re.match(self._VALID_URL, url) | |
3552 | if mobj is None: | |
3553 | raise ExtractorError(u'Invalid URL: %s' % url) | |
3554 | ||
3555 | video_id = mobj.group('videoid') | |
3556 | video_title = mobj.group('title') | |
3557 | ||
3558 | # Get webpage content | |
3559 | webpage = self._download_webpage(url, video_id) | |
3560 | ||
3561 | # Get the video URL | |
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
3563 | result = re.search(VIDEO_URL_RE, webpage) | |
3564 | if result is None: | |
3565 | raise ExtractorError(u'Unable to extract video url') | |
3566 | video_url = compat_urllib_parse.unquote(result.group('url')) | |
3567 | ||
        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))
3574 | ||
3575 | info = {'id': video_id, | |
3576 | 'url': video_url, | |
3577 | 'uploader': None, | |
3578 | 'upload_date': upload_date, | |
3579 | 'title': video_title, | |
3580 | 'ext': 'flv', | |
3581 | 'format': 'flv'} | |
3582 | ||
3583 | return [info] | |
3584 | ||
3585 | class YouJizzIE(InfoExtractor): | |
3586 | """Information extractor for youjizz.com.""" | |
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
3588 | ||
3589 | def _real_extract(self, url): | |
3590 | mobj = re.match(self._VALID_URL, url) | |
3591 | if mobj is None: | |
3592 | raise ExtractorError(u'Invalid URL: %s' % url) | |
3593 | ||
3594 | video_id = mobj.group('videoid') | |
3595 | ||
3596 | # Get webpage content | |
3597 | webpage = self._download_webpage(url, video_id) | |
3598 | ||
3599 | # Get the video title | |
3600 | result = re.search(r'<title>(?P<title>.*)</title>', webpage) | |
3601 | if result is None: | |
            raise ExtractorError(u'Unable to extract video title')
3603 | video_title = result.group('title').strip() | |
3604 | ||
3605 | # Get the embed page | |
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract embed page')
3609 | ||
3610 | embed_page_url = result.group(0).strip() | |
3611 | video_id = result.group('videoid') | |
3612 | ||
3613 | webpage = self._download_webpage(embed_page_url, video_id) | |
3614 | ||
3615 | # Get the video URL | |
3616 | result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage) | |
3617 | if result is None: | |
            raise ExtractorError(u'Unable to extract video url')
3619 | video_url = result.group('source') | |
3620 | ||
3621 | info = {'id': video_id, | |
3622 | 'url': video_url, | |
3623 | 'title': video_title, | |
3624 | 'ext': 'flv', | |
3625 | 'format': 'flv', | |
3626 | 'player_url': embed_page_url} | |
3627 | ||
3628 | return [info] | |
3629 | ||
3630 | class EightTracksIE(InfoExtractor): | |
3631 | IE_NAME = '8tracks' | |
    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3633 | ||
3634 | def _real_extract(self, url): | |
3635 | mobj = re.match(self._VALID_URL, url) | |
3636 | if mobj is None: | |
3637 | raise ExtractorError(u'Invalid URL: %s' % url) | |
3638 | playlist_id = mobj.group('id') | |
3639 | ||
3640 | webpage = self._download_webpage(url, playlist_id) | |
3641 | ||
3642 | m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) | |
3643 | if not m: | |
3644 | raise ExtractorError(u'Cannot find trax information') | |
3645 | json_like = m.group(1) | |
3646 | data = json.loads(json_like) | |
3647 | ||
3648 | session = str(random.randint(0, 1000000000)) | |
3649 | mix_id = data['id'] | |
3650 | track_count = data['tracks_count'] | |
3651 | first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) | |
3652 | next_url = first_url | |
3653 | res = [] | |
3654 | for i in itertools.count(): | |
3655 | api_json = self._download_webpage(next_url, playlist_id, | |
3656 | note=u'Downloading song information %s/%s' % (str(i+1), track_count), | |
3657 | errnote=u'Failed to download song information') | |
3658 | api_data = json.loads(api_json) | |
3659 | track_data = api_data[u'set']['track'] | |
3660 | info = { | |
3661 | 'id': track_data['id'], | |
3662 | 'url': track_data['track_file_stream_url'], | |
3663 | 'title': track_data['performer'] + u' - ' + track_data['name'], | |
3664 | 'raw_title': track_data['name'], | |
3665 | 'uploader_id': data['user']['login'], | |
3666 | 'ext': 'm4a', | |
3667 | } | |
3668 | res.append(info) | |
3669 | if api_data['set']['at_last_track']: | |
3670 | break | |
3671 | next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) | |
3672 | return res | |
3673 | ||
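# EightTracksIE above walks a mix one track at a time: the first request
# starts playback for a random session id, and each following request asks
# for the next track until 'at_last_track' is set. A trimmed sketch of that
# loop, with `fetch_json` standing in for _download_webpage plus json.loads:
def _example_walk_8tracks_mix(fetch_json, session, mix_id):
    tracks = []
    next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
    while True:
        api_data = fetch_json(next_url)
        track_data = api_data['set']['track']
        tracks.append(track_data)
        if api_data['set']['at_last_track']:
            return tracks
        next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                    % (session, mix_id, track_data['id']))
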
3674 | class KeekIE(InfoExtractor): | |
3675 | _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)' | |
3676 | IE_NAME = u'keek' | |
3677 | ||
3678 | def _real_extract(self, url): | |
3679 | m = re.match(self._VALID_URL, url) | |
3680 | video_id = m.group('videoID') | |
3681 | video_url = u'http://cdn.keek.com/keek/video/%s' % video_id | |
3682 | thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id | |
3683 | webpage = self._download_webpage(url, video_id) | |
3684 | m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) | |
3685 | title = unescapeHTML(m.group('title')) | |
3686 | m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage) | |
3687 | uploader = clean_html(m.group('uploader')) | |
3688 | info = { | |
3689 | 'id': video_id, | |
3690 | 'url': video_url, | |
3691 | 'ext': 'mp4', | |
3692 | 'title': title, | |
3693 | 'thumbnail': thumbnail, | |
3694 | 'uploader': uploader | |
3695 | } | |
3696 | return [info] | |
3697 | ||
3698 | class TEDIE(InfoExtractor): | |
3699 | _VALID_URL=r'''http://www\.ted\.com/ | |
3700 | ( | |
3701 | ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist | |
3702 | | | |
3703 | ((?P<type_talk>talks)) # We have a simple talk | |
3704 | ) | |
3705 | (/lang/(.*?))? # The url may contain the language | |
3706 | /(?P<name>\w+) # Here goes the name and then ".html" | |
3707 | ''' | |
3708 | ||
3709 | @classmethod | |
3710 | def suitable(cls, url): | |
3711 | """Receives a URL and returns True if suitable for this IE.""" | |
3712 | return re.match(cls._VALID_URL, url, re.VERBOSE) is not None | |
3713 | ||
    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]
3723 | ||
3724 | def _talk_video_link(self,mediaSlug): | |
3725 | '''Returns the video link for that mediaSlug''' | |
3726 | return 'http://download.ted.com/talks/%s.mp4' % mediaSlug | |
3727 | ||
3728 | def _playlist_videos_info(self,url,name,playlist_id=0): | |
3729 | '''Returns the videos of the playlist''' | |
3730 | video_RE=r''' | |
3731 | <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)" | |
3732 | ([.\s]*?)data-playlist_item_id="(\d+)" | |
3733 | ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)" | |
3734 | ''' | |
3735 | video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>' | |
3736 | webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') | |
3737 | m_videos=re.finditer(video_RE,webpage,re.VERBOSE) | |
3738 | m_names=re.finditer(video_name_RE,webpage) | |
3739 | ||
3740 | playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>' | |
3741 | m_playlist = re.search(playlist_RE, webpage) | |
3742 | playlist_title = m_playlist.group('playlist_title') | |
3743 | ||
3744 | playlist_entries = [] | |
3745 | for m_video, m_name in zip(m_videos,m_names): | |
3746 | video_id=m_video.group('video_id') | |
3747 | talk_url='http://www.ted.com%s' % m_name.group('talk_url') | |
3748 | playlist_entries.append(self.url_result(talk_url, 'TED')) | |
3749 | return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title) | |
3750 | ||
3751 | def _talk_info(self, url, video_id=0): | |
3752 | """Return the video for the talk in the url""" | |
3753 | m=re.match(self._VALID_URL, url,re.VERBOSE) | |
3754 | videoName=m.group('name') | |
3755 | webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) | |
3756 | # If the url includes the language we get the title translated | |
3757 | title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>' | |
3758 | title=re.search(title_RE, webpage).group('title') | |
3759 | info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) | |
3760 | "id":(?P<videoID>[\d]+).*? | |
3761 | "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' | |
3762 | thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' | |
3763 | thumb_match=re.search(thumb_RE,webpage) | |
3764 | info_match=re.search(info_RE,webpage,re.VERBOSE) | |
3765 | video_id=info_match.group('videoID') | |
3766 | mediaSlug=info_match.group('mediaSlug') | |
3767 | video_url=self._talk_video_link(mediaSlug) | |
3768 | info = { | |
3769 | 'id': video_id, | |
3770 | 'url': video_url, | |
3771 | 'ext': 'mp4', | |
3772 | 'title': title, | |
3773 | 'thumbnail': thumb_match.group('thumbnail') | |
3774 | } | |
3775 | return info | |
3776 | ||
3777 | class MySpassIE(InfoExtractor): | |
    _VALID_URL = r'http://www\.myspass\.de/.*'
3779 | ||
3780 | def _real_extract(self, url): | |
3781 | META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' | |
3782 | ||
3783 | # video id is the last path element of the URL | |
3784 | # usually there is a trailing slash, so also try the second but last | |
3785 | url_path = compat_urllib_parse_urlparse(url).path | |
3786 | url_parent_path, video_id = os.path.split(url_path) | |
3787 | if not video_id: | |
3788 | _, video_id = os.path.split(url_parent_path) | |
3789 | ||
3790 | # get metadata | |
3791 | metadata_url = META_DATA_URL_TEMPLATE % video_id | |
3792 | metadata_text = self._download_webpage(metadata_url, video_id) | |
3793 | metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) | |
3794 | ||
3795 | # extract values from metadata | |
3796 | url_flv_el = metadata.find('url_flv') | |
3797 | if url_flv_el is None: | |
3798 | raise ExtractorError(u'Unable to extract download url') | |
3799 | video_url = url_flv_el.text | |
3800 | extension = os.path.splitext(video_url)[1][1:] | |
3801 | title_el = metadata.find('title') | |
3802 | if title_el is None: | |
3803 | raise ExtractorError(u'Unable to extract title') | |
3804 | title = title_el.text | |
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = extension
        else:
            format = format_id_el.text
3810 | description_el = metadata.find('description') | |
3811 | if description_el is not None: | |
3812 | description = description_el.text | |
3813 | else: | |
3814 | description = None | |
3815 | imagePreview_el = metadata.find('imagePreview') | |
3816 | if imagePreview_el is not None: | |
3817 | thumbnail = imagePreview_el.text | |
3818 | else: | |
3819 | thumbnail = None | |
3820 | info = { | |
3821 | 'id': video_id, | |
3822 | 'url': video_url, | |
3823 | 'title': title, | |
3824 | 'ext': extension, | |
3825 | 'format': format, | |
3826 | 'thumbnail': thumbnail, | |
3827 | 'description': description | |
3828 | } | |
3829 | return [info] | |
3830 | ||
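# MySpassIE above derives the video id from the last URL path element,
# falling back to the second-to-last one when the URL ends with a slash.
# A standalone sketch of that trick (the sample URL is illustrative):
def _example_id_from_path(url):
    url_path = compat_urllib_parse_urlparse(url).path
    parent_path, video_id = os.path.split(url_path)
    if not video_id:
        _, video_id = os.path.split(parent_path)
    return video_id

# _example_id_from_path('http://www.myspass.de/show/clip/11741/') -> '11741'
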
3831 | class SpiegelIE(InfoExtractor): | |
3832 | _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' | |
3833 | ||
3834 | def _real_extract(self, url): | |
3835 | m = re.match(self._VALID_URL, url) | |
3836 | video_id = m.group('videoID') | |
3837 | ||
3838 | webpage = self._download_webpage(url, video_id) | |
3839 | m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage) | |
3840 | if not m: | |
3841 | raise ExtractorError(u'Cannot find title') | |
3842 | video_title = unescapeHTML(m.group(1)) | |
3843 | ||
3844 | xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' | |
3845 | xml_code = self._download_webpage(xml_url, video_id, | |
3846 | note=u'Downloading XML', errnote=u'Failed to download XML') | |
3847 | ||
3848 | idoc = xml.etree.ElementTree.fromstring(xml_code) | |
3849 | last_type = idoc[-1] | |
3850 | filename = last_type.findall('./filename')[0].text | |
3851 | duration = float(last_type.findall('./duration')[0].text) | |
3852 | ||
3853 | video_url = 'http://video2.spiegel.de/flash/' + filename | |
3854 | video_ext = filename.rpartition('.')[2] | |
3855 | info = { | |
3856 | 'id': video_id, | |
3857 | 'url': video_url, | |
3858 | 'ext': video_ext, | |
3859 | 'title': video_title, | |
3860 | 'duration': duration, | |
3861 | } | |
3862 | return [info] | |
3863 | ||
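# SpiegelIE above reads its stream list from an XML document and takes the
# last child node as the best variant. The same selection as a standalone
# sketch (expects XML shaped like the idoc handled above):
def _example_spiegel_last_variant(xml_code):
    idoc = xml.etree.ElementTree.fromstring(xml_code)
    last_type = idoc[-1]
    filename = last_type.findall('./filename')[0].text
    duration = float(last_type.findall('./duration')[0].text)
    return filename, duration
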
3864 | class LiveLeakIE(InfoExtractor): | |
3865 | ||
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3867 | IE_NAME = u'liveleak' | |
3868 | ||
3869 | def _real_extract(self, url): | |
3870 | mobj = re.match(self._VALID_URL, url) | |
3871 | if mobj is None: | |
3872 | raise ExtractorError(u'Invalid URL: %s' % url) | |
3873 | ||
3874 | video_id = mobj.group('video_id') | |
3875 | ||
3876 | webpage = self._download_webpage(url, video_id) | |
3877 | ||
3878 | m = re.search(r'file: "(.*?)",', webpage) | |
3879 | if not m: | |
3880 | raise ExtractorError(u'Unable to find video url') | |
3881 | video_url = m.group(1) | |
3882 | ||
3883 | m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) | |
3884 | if not m: | |
3885 | raise ExtractorError(u'Cannot find video title') | |
3886 | title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip() | |
3887 | ||
3888 | m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) | |
3889 | if m: | |
3890 | desc = unescapeHTML(m.group('desc')) | |
3891 | else: | |
3892 | desc = None | |
3893 | ||
3894 | m = re.search(r'By:.*?(\w+)</a>', webpage) | |
3895 | if m: | |
3896 | uploader = clean_html(m.group(1)) | |
3897 | else: | |
3898 | uploader = None | |
3899 | ||
3900 | info = { | |
3901 | 'id': video_id, | |
3902 | 'url': video_url, | |
3903 | 'ext': 'mp4', | |
3904 | 'title': title, | |
3905 | 'description': desc, | |
3906 | 'uploader': uploader | |
3907 | } | |
3908 | ||
3909 | return [info] | |
3910 | ||
3911 | class ARDIE(InfoExtractor): | |
3912 | _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' | |
3913 | _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>' | |
3914 | _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)' | |
3915 | ||
3916 | def _real_extract(self, url): | |
3917 | # determine video id from url | |
3918 | m = re.match(self._VALID_URL, url) | |
3919 | ||
3920 | numid = re.search(r'documentId=([0-9]+)', url) | |
3921 | if numid: | |
3922 | video_id = numid.group(1) | |
3923 | else: | |
3924 | video_id = m.group('video_id') | |
3925 | ||
3926 | # determine title and media streams from webpage | |
3927 | html = self._download_webpage(url, video_id) | |
3928 | title = re.search(self._TITLE, html).group('title') | |
3929 | streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] | |
3930 | if not streams: | |
3931 | assert '"fsk"' in html | |
3932 | raise ExtractorError(u'This video is only available after 8:00 pm') | |
3933 | ||
3934 | # choose default media type and highest quality for now | |
3935 | stream = max([s for s in streams if int(s["media_type"]) == 0], | |
3936 | key=lambda s: int(s["quality"])) | |
3937 | ||
        # there are two possibilities: RTMP stream or HTTP download
3939 | info = {'id': video_id, 'title': title, 'ext': 'mp4'} | |
3940 | if stream['rtmp_url']: | |
3941 | self.to_screen(u'RTMP download detected') | |
3942 | assert stream['video_url'].startswith('mp4:') | |
3943 | info["url"] = stream["rtmp_url"] | |
3944 | info["play_path"] = stream['video_url'] | |
3945 | else: | |
3946 | assert stream["video_url"].endswith('.mp4') | |
3947 | info["url"] = stream["video_url"] | |
3948 | return [info] | |
3949 | ||
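# The stream choice in ARDIE above: keep the default media type (0) and take
# the highest quality variant. A standalone sketch over a list of stream
# dicts (the sample data is made up):
def _example_pick_ard_stream(streams):
    return max([s for s in streams if int(s['media_type']) == 0],
               key=lambda s: int(s['quality']))

# _example_pick_ard_stream([
#     {'media_type': '0', 'quality': '1', 'rtmp_url': '', 'video_url': 'a.mp4'},
#     {'media_type': '0', 'quality': '2', 'rtmp_url': '', 'video_url': 'b.mp4'},
# ])['video_url'] -> 'b.mp4'
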
3950 | class TumblrIE(InfoExtractor): | |
3951 | _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' | |
3952 | ||
3953 | def _real_extract(self, url): | |
3954 | m_url = re.match(self._VALID_URL, url) | |
3955 | video_id = m_url.group('id') | |
3956 | blog = m_url.group('blog_name') | |
3957 | ||
3958 | url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) | |
3959 | webpage = self._download_webpage(url, video_id) | |
3960 | ||
3961 | re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) | |
3962 | video = re.search(re_video, webpage) | |
3963 | if video is None: | |
            self.to_screen(u"No video found")
3965 | return [] | |
3966 | video_url = video.group('video_url') | |
3967 | ext = video.group('ext') | |
3968 | ||
3969 | re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster | |
3970 | thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '') | |
3971 | ||
3972 | # The only place where you can get a title, it's not complete, | |
3973 | # but searching in other places doesn't work for all videos | |
3974 | re_title = r'<title>(?P<title>.*?)</title>' | |
3975 | title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) | |
3976 | ||
3977 | return [{'id': video_id, | |
3978 | 'url': video_url, | |
3979 | 'title': title, | |
3980 | 'thumbnail': thumb, | |
3981 | 'ext': ext | |
3982 | }] | |
3983 | ||
3984 | class BandcampIE(InfoExtractor): | |
3985 | _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)' | |
3986 | ||
3987 | def _real_extract(self, url): | |
3988 | mobj = re.match(self._VALID_URL, url) | |
3989 | title = mobj.group('title') | |
3990 | webpage = self._download_webpage(url, title) | |
3991 | # We get the link to the free download page | |
3992 | m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) | |
3993 | if m_download is None: | |
            raise ExtractorError(u'No free songs found')
3995 | ||
3996 | download_link = m_download.group(1) | |
3997 | id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', | |
3998 | webpage, re.MULTILINE|re.DOTALL).group('id') | |
3999 | ||
4000 | download_webpage = self._download_webpage(download_link, id, | |
4001 | 'Downloading free downloads page') | |
        # We get the dictionary of the track from some javascript code
4003 | info = re.search(r'items: (.*?),$', | |
4004 | download_webpage, re.MULTILINE).group(1) | |
4005 | info = json.loads(info)[0] | |
4006 | # We pick mp3-320 for now, until format selection can be easily implemented. | |
4007 | mp3_info = info[u'downloads'][u'mp3-320'] | |
4008 | # If we try to use this url it says the link has expired | |
4009 | initial_url = mp3_info[u'url'] | |
4010 | re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$' | |
4011 | m_url = re.match(re_url, initial_url) | |
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
4014 | request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) | |
4015 | final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url') | |
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
4018 | final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) | |

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]

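# A minimal usage sketch (the URL is hypothetical), relying only on the base
# class interface: instantiate the extractor and call extract() on a track page.
#   ie = BandcampIE()
#   if ie.suitable(u'http://someartist.bandcamp.com/track/some-song'):
#       results = ie.extract(u'http://someartist.bandcamp.com/track/some-song')
#       # results[0]['url'] then holds the final mp3-320 download link
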
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        mobj = re.search(r'<source src="(.+)" type="video/mp4">', webpage)

        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')

        video_url = mobj.group(1)
        mobj = re.search(r'<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]

class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
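        # Ina publishes an MRSS feed for each video; it carries both the mp4
        # url and the title, so we parse the feed instead of the HTML page.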
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group('mp4url')

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group('titre')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]

def gen_extractors():
    """ Return a list containing an instance of every supported extractor.
    The order does matter; the first extractor that matches is the one handling the URL.
    """
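    # GenericIE is the catch-all fallback, so it must stay last in this list.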
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        GenericIE()
    ]

def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    return globals()[ie_name + 'IE']
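# For example, get_info_extractor(u'Bandcamp') returns the BandcampIE class
# itself (not an instance); callers are expected to instantiate it.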