jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	import base64
	2	import datetime
	3	import itertools
	4	import netrc
	5	import os
	6	import re
	7	import socket
	8	import time
	9	import email.utils
	10	import xml.etree.ElementTree
	11	import random
	12	import math
	13	import operator
	14	import hashlib
	15	import binascii
	16	import urllib
	17
	18	from .utils import *
	19	from .extractor.common import InfoExtractor, SearchInfoExtractor
	20
	21	from .extractor.ard import ARDIE
	22	from .extractor.arte import ArteTvIE
	23	from .extractor.bliptv import BlipTVIE, BlipTVUserIE
	24	from .extractor.comedycentral import ComedyCentralIE
	25	from .extractor.collegehumor import CollegeHumorIE
	26	from .extractor.dailymotion import DailymotionIE
	27	from .extractor.depositfiles import DepositFilesIE
	28	from .extractor.escapist import EscapistIE
	29	from .extractor.facebook import FacebookIE
	30	from .extractor.funnyordie import FunnyOrDieIE
	31	from .extractor.gametrailers import GametrailersIE
	32	from .extractor.generic import GenericIE
	33	from .extractor.googleplus import GooglePlusIE
	34	from .extractor.googlesearch import GoogleSearchIE
	35	from .extractor.infoq import InfoQIE
	36	from .extractor.metacafe import MetacafeIE
	37	from .extractor.mixcloud import MixcloudIE
	38	from .extractor.mtv import MTVIE
	39	from .extractor.myvideo import MyVideoIE
	40	from .extractor.nba import NBAIE
	41	from .extractor.statigram import StatigramIE
	42	from .extractor.photobucket import PhotobucketIE
	43	from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
	44	from .extractor.stanfordoc import StanfordOpenClassroomIE
	45	from .extractor.steam import SteamIE
	46	from .extractor.ted import TEDIE
	47	from .extractor.vimeo import VimeoIE
	48	from .extractor.worldstarhiphop import WorldStarHipHopIE
	49	from .extractor.xnxx import XNXXIE
	50	from .extractor.xvideos import XVideosIE
	51	from .extractor.yahoo import YahooIE, YahooSearchIE
	52	from .extractor.youku import YoukuIE
	53	from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
	54	from .extractor.zdf import ZDFIE
	55
	56
	57
	58
	59
	60
	61
	62
	63
	64
	65
	66
	67
	68	class JustinTVIE(InfoExtractor):
	69	"""Information extractor for justin.tv and twitch.tv"""
	70	# TODO: One broadcast may be split into multiple videos. The key
	71	# 'broadcast_id' is the same for all parts, and 'broadcast_part'
	72	# starts at 1 and increases. Can we treat all parts as one video?
	73
	74	_VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch\|justin)\.tv/
	75	(?:
	76	(?P<channelid>[^/]+)\|
	77	(?:(?:[^/]+)/b/(?P<videoid>[^/]+))\|
	78	(?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
	79	)
	80	/?(?:\#.*)?$
	81	"""
	82	_JUSTIN_PAGE_LIMIT = 100
	83	IE_NAME = u'justin.tv'
	84
	85	def report_download_page(self, channel, offset):
	86	"""Report attempt to download a single page of videos."""
	87	self.to_screen(u'%s: Downloading video information from %d to %d' %
	88	(channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
	89
	90	# Return count of items, list of valid items
	91	def _parse_page(self, url, video_id):
	92	webpage = self._download_webpage(url, video_id,
	93	u'Downloading video info JSON',
	94	u'unable to download video info JSON')
	95
	96	response = json.loads(webpage)
	97	if type(response) != list:
	98	error_text = response.get('error', 'unknown error')
	99	raise ExtractorError(u'Justin.tv API: %s' % error_text)
	100	info = []
	101	for clip in response:
	102	video_url = clip['video_file_url']
	103	if video_url:
	104	video_extension = os.path.splitext(video_url)[1][1:]
	105	video_date = re.sub('-', '', clip['start_time'][:10])
	106	video_uploader_id = clip.get('user_id', clip.get('channel_id'))
	107	video_id = clip['id']
	108	video_title = clip.get('title', video_id)
	109	info.append({
	110	'id': video_id,
	111	'url': video_url,
	112	'title': video_title,
	113	'uploader': clip.get('channel_name', video_uploader_id),
	114	'uploader_id': video_uploader_id,
	115	'upload_date': video_date,
	116	'ext': video_extension,
	117	})
	118	return (len(response), info)
	119
	120	def _real_extract(self, url):
	121	mobj = re.match(self._VALID_URL, url)
	122	if mobj is None:
	123	raise ExtractorError(u'invalid URL: %s' % url)
	124
	125	api_base = 'http://api.justin.tv'
	126	paged = False
	127	if mobj.group('channelid'):
	128	paged = True
	129	video_id = mobj.group('channelid')
	130	api = api_base + '/channel/archives/%s.json' % video_id
	131	elif mobj.group('chapterid'):
	132	chapter_id = mobj.group('chapterid')
	133
	134	webpage = self._download_webpage(url, chapter_id)
	135	m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
	136	if not m:
	137	raise ExtractorError(u'Cannot find archive of a chapter')
	138	archive_id = m.group(1)
	139
	140	api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
	141	chapter_info_xml = self._download_webpage(api, chapter_id,
	142	note=u'Downloading chapter information',
	143	errnote=u'Chapter information download failed')
	144	doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
	145	for a in doc.findall('.//archive'):
	146	if archive_id == a.find('./id').text:
	147	break
	148	else:
	149	raise ExtractorError(u'Could not find chapter in chapter information')
	150
	151	video_url = a.find('./video_file_url').text
	152	video_ext = video_url.rpartition('.')[2] or u'flv'
	153
	154	chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
	155	chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
	156	note='Downloading chapter metadata',
	157	errnote='Download of chapter metadata failed')
	158	chapter_info = json.loads(chapter_info_json)
	159
	160	bracket_start = int(doc.find('.//bracket_start').text)
	161	bracket_end = int(doc.find('.//bracket_end').text)
	162
	163	# TODO determine start (and probably fix up file)
	164	# youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
	165	#video_url += u'?start=' + TODO:start_timestamp
	166	# bracket_start is 13290, but we want 51670615
	167	self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
	168	u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
	169
	170	info = {
	171	'id': u'c' + chapter_id,
	172	'url': video_url,
	173	'ext': video_ext,
	174	'title': chapter_info['title'],
	175	'thumbnail': chapter_info['preview'],
	176	'description': chapter_info['description'],
	177	'uploader': chapter_info['channel']['display_name'],
	178	'uploader_id': chapter_info['channel']['name'],
	179	}
	180	return [info]
	181	else:
	182	video_id = mobj.group('videoid')
	183	api = api_base + '/broadcast/by_archive/%s.json' % video_id
	184
	185	self.report_extraction(video_id)
	186
	187	info = []
	188	offset = 0
	189	limit = self._JUSTIN_PAGE_LIMIT
	190	while True:
	191	if paged:
	192	self.report_download_page(video_id, offset)
	193	page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
	194	page_count, page_info = self._parse_page(page_url, video_id)
	195	info.extend(page_info)
	196	if not paged or page_count != limit:
	197	break
	198	offset += limit
	199	return info
	200
	201
	202
	203	class UstreamIE(InfoExtractor):
	204	_VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
	205	IE_NAME = u'ustream'
	206
	207	def _real_extract(self, url):
	208	m = re.match(self._VALID_URL, url)
	209	video_id = m.group('videoID')
	210
	211	video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
	212	webpage = self._download_webpage(url, video_id)
	213
	214	self.report_extraction(video_id)
	215
	216	video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
	217	webpage, u'title')
	218
	219	uploader = self._html_search_regex(r'data-content-type="channel".?>(?P<uploader>.?)</a>',
	220	webpage, u'uploader', fatal=False, flags=re.DOTALL)
	221
	222	thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
	223	webpage, u'thumbnail', fatal=False)
	224
	225	info = {
	226	'id': video_id,
	227	'url': video_url,
	228	'ext': 'flv',
	229	'title': video_title,
	230	'uploader': uploader,
	231	'thumbnail': thumbnail,
	232	}
	233	return info
	234
	235
	236	class RBMARadioIE(InfoExtractor):
	237	_VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
	238
	239	def _real_extract(self, url):
	240	m = re.match(self._VALID_URL, url)
	241	video_id = m.group('videoID')
	242
	243	webpage = self._download_webpage(url, video_id)
	244
	245	json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
	246	webpage, u'json data', flags=re.MULTILINE)
	247
	248	try:
	249	data = json.loads(json_data)
	250	except ValueError as e:
	251	raise ExtractorError(u'Invalid JSON: ' + str(e))
	252
	253	video_url = data['akamai_url'] + '&cbr=256'
	254	url_parts = compat_urllib_parse_urlparse(video_url)
	255	video_ext = url_parts.path.rpartition('.')[2]
	256	info = {
	257	'id': video_id,
	258	'url': video_url,
	259	'ext': video_ext,
	260	'title': data['title'],
	261	'description': data.get('teaser_text'),
	262	'location': data.get('country_of_origin'),
	263	'uploader': data.get('host', {}).get('name'),
	264	'uploader_id': data.get('host', {}).get('slug'),
	265	'thumbnail': data.get('image', {}).get('large_url_2x'),
	266	'duration': data.get('duration'),
	267	}
	268	return [info]
	269
	270
	271	class YouPornIE(InfoExtractor):
	272	"""Information extractor for youporn.com."""
	273	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
	274
	275	def _print_formats(self, formats):
	276	"""Print all available formats"""
	277	print(u'Available formats:')
	278	print(u'ext\t\tformat')
	279	print(u'---------------------------------')
	280	for format in formats:
	281	print(u'%s\t\t%s' % (format['ext'], format['format']))
	282
	283	def _specific(self, req_format, formats):
	284	for x in formats:
	285	if(x["format"]==req_format):
	286	return x
	287	return None
	288
	289	def _real_extract(self, url):
	290	mobj = re.match(self._VALID_URL, url)
	291	if mobj is None:
	292	raise ExtractorError(u'Invalid URL: %s' % url)
	293	video_id = mobj.group('videoid')
	294
	295	req = compat_urllib_request.Request(url)
	296	req.add_header('Cookie', 'age_verified=1')
	297	webpage = self._download_webpage(req, video_id)
	298
	299	# Get JSON parameters
	300	json_params = self._search_regex(r'var currentVideo = new Video$(.*)$;', webpage, u'JSON parameters')
	301	try:
	302	params = json.loads(json_params)
	303	except:
	304	raise ExtractorError(u'Invalid JSON')
	305
	306	self.report_extraction(video_id)
	307	try:
	308	video_title = params['title']
	309	upload_date = unified_strdate(params['release_date_f'])
	310	video_description = params['description']
	311	video_uploader = params['submitted_by']
	312	thumbnail = params['thumbnails'][0]['image']
	313	except KeyError:
	314	raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
	315
	316	# Get all of the formats available
	317	DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
	318	download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
	319	webpage, u'download list').strip()
	320
	321	# Get all of the links from the page
	322	LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
	323	links = re.findall(LINK_RE, download_list_html)
	324	if(len(links) == 0):
	325	raise ExtractorError(u'ERROR: no known formats available for video')
	326
	327	self.to_screen(u'Links found: %d' % len(links))
	328
	329	formats = []
	330	for link in links:
	331
	332	# A link looks like this:
	333	# http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
	334	# A path looks like this:
	335	# /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
	336	video_url = unescapeHTML( link )
	337	path = compat_urllib_parse_urlparse( video_url ).path
	338	extension = os.path.splitext( path )[1][1:]
	339	format = path.split('/')[4].split('_')[:2]
	340	size = format[0]
	341	bitrate = format[1]
	342	format = "-".join( format )
	343	# title = u'%s-%s-%s' % (video_title, size, bitrate)
	344
	345	formats.append({
	346	'id': video_id,
	347	'url': video_url,
	348	'uploader': video_uploader,
	349	'upload_date': upload_date,
	350	'title': video_title,
	351	'ext': extension,
	352	'format': format,
	353	'thumbnail': thumbnail,
	354	'description': video_description
	355	})
	356
	357	if self._downloader.params.get('listformats', None):
	358	self._print_formats(formats)
	359	return
	360
	361	req_format = self._downloader.params.get('format', None)
	362	self.to_screen(u'Format: %s' % req_format)
	363
	364	if req_format is None or req_format == 'best':
	365	return [formats[0]]
	366	elif req_format == 'worst':
	367	return [formats[-1]]
	368	elif req_format in ('-1', 'all'):
	369	return formats
	370	else:
	371	format = self._specific( req_format, formats )
	372	if result is None:
	373	raise ExtractorError(u'Requested format not available')
	374	return [format]
	375
	376
	377
	378	class PornotubeIE(InfoExtractor):
	379	"""Information extractor for pornotube.com."""
	380	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
	381
	382	def _real_extract(self, url):
	383	mobj = re.match(self._VALID_URL, url)
	384	if mobj is None:
	385	raise ExtractorError(u'Invalid URL: %s' % url)
	386
	387	video_id = mobj.group('videoid')
	388	video_title = mobj.group('title')
	389
	390	# Get webpage content
	391	webpage = self._download_webpage(url, video_id)
	392
	393	# Get the video URL
	394	VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
	395	video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
	396	video_url = compat_urllib_parse.unquote(video_url)
	397
	398	#Get the uploaded date
	399	VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
	400	upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
	401	if upload_date: upload_date = unified_strdate(upload_date)
	402
	403	info = {'id': video_id,
	404	'url': video_url,
	405	'uploader': None,
	406	'upload_date': upload_date,
	407	'title': video_title,
	408	'ext': 'flv',
	409	'format': 'flv'}
	410
	411	return [info]
	412
	413	class YouJizzIE(InfoExtractor):
	414	"""Information extractor for youjizz.com."""
	415	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
	416
	417	def _real_extract(self, url):
	418	mobj = re.match(self._VALID_URL, url)
	419	if mobj is None:
	420	raise ExtractorError(u'Invalid URL: %s' % url)
	421
	422	video_id = mobj.group('videoid')
	423
	424	# Get webpage content
	425	webpage = self._download_webpage(url, video_id)
	426
	427	# Get the video title
	428	video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
	429	webpage, u'title').strip()
	430
	431	# Get the embed page
	432	result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
	433	if result is None:
	434	raise ExtractorError(u'ERROR: unable to extract embed page')
	435
	436	embed_page_url = result.group(0).strip()
	437	video_id = result.group('videoid')
	438
	439	webpage = self._download_webpage(embed_page_url, video_id)
	440
	441	# Get the video URL
	442	video_url = self._search_regex(r'so.addVariable$"file",encodeURIComponent\("(?P<source>[^"]+)"$\);',
	443	webpage, u'video URL')
	444
	445	info = {'id': video_id,
	446	'url': video_url,
	447	'title': video_title,
	448	'ext': 'flv',
	449	'format': 'flv',
	450	'player_url': embed_page_url}
	451
	452	return [info]
	453
	454	class EightTracksIE(InfoExtractor):
	455	IE_NAME = '8tracks'
	456	_VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
	457
	458	def _real_extract(self, url):
	459	mobj = re.match(self._VALID_URL, url)
	460	if mobj is None:
	461	raise ExtractorError(u'Invalid URL: %s' % url)
	462	playlist_id = mobj.group('id')
	463
	464	webpage = self._download_webpage(url, playlist_id)
	465
	466	json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
	467	data = json.loads(json_like)
	468
	469	session = str(random.randint(0, 1000000000))
	470	mix_id = data['id']
	471	track_count = data['tracks_count']
	472	first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
	473	next_url = first_url
	474	res = []
	475	for i in itertools.count():
	476	api_json = self._download_webpage(next_url, playlist_id,
	477	note=u'Downloading song information %s/%s' % (str(i+1), track_count),
	478	errnote=u'Failed to download song information')
	479	api_data = json.loads(api_json)
	480	track_data = api_data[u'set']['track']
	481	info = {
	482	'id': track_data['id'],
	483	'url': track_data['track_file_stream_url'],
	484	'title': track_data['performer'] + u' - ' + track_data['name'],
	485	'raw_title': track_data['name'],
	486	'uploader_id': data['user']['login'],
	487	'ext': 'm4a',
	488	}
	489	res.append(info)
	490	if api_data['set']['at_last_track']:
	491	break
	492	next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
	493	return res
	494
	495	class KeekIE(InfoExtractor):
	496	_VALID_URL = r'http://(?:www\.)?keek\.com/(?:!\|\w+/keeks/)(?P<videoID>\w+)'
	497	IE_NAME = u'keek'
	498
	499	def _real_extract(self, url):
	500	m = re.match(self._VALID_URL, url)
	501	video_id = m.group('videoID')
	502
	503	video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
	504	thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
	505	webpage = self._download_webpage(url, video_id)
	506
	507	video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
	508	webpage, u'title')
	509
	510	uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
	511	webpage, u'uploader', fatal=False)
	512
	513	info = {
	514	'id': video_id,
	515	'url': video_url,
	516	'ext': 'mp4',
	517	'title': video_title,
	518	'thumbnail': thumbnail,
	519	'uploader': uploader
	520	}
	521	return [info]
	522
	523
	524	class MySpassIE(InfoExtractor):
	525	_VALID_URL = r'http://www.myspass.de/.*'
	526
	527	def _real_extract(self, url):
	528	META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
	529
	530	# video id is the last path element of the URL
	531	# usually there is a trailing slash, so also try the second but last
	532	url_path = compat_urllib_parse_urlparse(url).path
	533	url_parent_path, video_id = os.path.split(url_path)
	534	if not video_id:
	535	_, video_id = os.path.split(url_parent_path)
	536
	537	# get metadata
	538	metadata_url = META_DATA_URL_TEMPLATE % video_id
	539	metadata_text = self._download_webpage(metadata_url, video_id)
	540	metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
	541
	542	# extract values from metadata
	543	url_flv_el = metadata.find('url_flv')
	544	if url_flv_el is None:
	545	raise ExtractorError(u'Unable to extract download url')
	546	video_url = url_flv_el.text
	547	extension = os.path.splitext(video_url)[1][1:]
	548	title_el = metadata.find('title')
	549	if title_el is None:
	550	raise ExtractorError(u'Unable to extract title')
	551	title = title_el.text
	552	format_id_el = metadata.find('format_id')
	553	if format_id_el is None:
	554	format = ext
	555	else:
	556	format = format_id_el.text
	557	description_el = metadata.find('description')
	558	if description_el is not None:
	559	description = description_el.text
	560	else:
	561	description = None
	562	imagePreview_el = metadata.find('imagePreview')
	563	if imagePreview_el is not None:
	564	thumbnail = imagePreview_el.text
	565	else:
	566	thumbnail = None
	567	info = {
	568	'id': video_id,
	569	'url': video_url,
	570	'title': title,
	571	'ext': extension,
	572	'format': format,
	573	'thumbnail': thumbnail,
	574	'description': description
	575	}
	576	return [info]
	577
	578	class SpiegelIE(InfoExtractor):
	579	_VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]-(?P<videoID>[0-9]+)(?:\.html)?(?:#.)?$'
	580
	581	def _real_extract(self, url):
	582	m = re.match(self._VALID_URL, url)
	583	video_id = m.group('videoID')
	584
	585	webpage = self._download_webpage(url, video_id)
	586
	587	video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
	588	webpage, u'title')
	589
	590	xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
	591	xml_code = self._download_webpage(xml_url, video_id,
	592	note=u'Downloading XML', errnote=u'Failed to download XML')
	593
	594	idoc = xml.etree.ElementTree.fromstring(xml_code)
	595	last_type = idoc[-1]
	596	filename = last_type.findall('./filename')[0].text
	597	duration = float(last_type.findall('./duration')[0].text)
	598
	599	video_url = 'http://video2.spiegel.de/flash/' + filename
	600	video_ext = filename.rpartition('.')[2]
	601	info = {
	602	'id': video_id,
	603	'url': video_url,
	604	'ext': video_ext,
	605	'title': video_title,
	606	'duration': duration,
	607	}
	608	return [info]
	609
	610	class LiveLeakIE(InfoExtractor):
	611
	612	_VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.?)i=(?P<video_id>[\w_]+)(?:.)'
	613	IE_NAME = u'liveleak'
	614
	615	def _real_extract(self, url):
	616	mobj = re.match(self._VALID_URL, url)
	617	if mobj is None:
	618	raise ExtractorError(u'Invalid URL: %s' % url)
	619
	620	video_id = mobj.group('video_id')
	621
	622	webpage = self._download_webpage(url, video_id)
	623
	624	video_url = self._search_regex(r'file: "(.*?)",',
	625	webpage, u'video URL')
	626
	627	video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
	628	webpage, u'title').replace('LiveLeak.com -', '').strip()
	629
	630	video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
	631	webpage, u'description', fatal=False)
	632
	633	video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
	634	webpage, u'uploader', fatal=False)
	635
	636	info = {
	637	'id': video_id,
	638	'url': video_url,
	639	'ext': 'mp4',
	640	'title': video_title,
	641	'description': video_description,
	642	'uploader': video_uploader
	643	}
	644
	645	return [info]
	646
	647
	648
	649	class TumblrIE(InfoExtractor):
	650	_VALID_URL = r'http://(?P<blog_name>.?)\.tumblr\.com/((post)\|(video))/(?P<id>\d)/(.*?)'
	651
	652	def _real_extract(self, url):
	653	m_url = re.match(self._VALID_URL, url)
	654	video_id = m_url.group('id')
	655	blog = m_url.group('blog_name')
	656
	657	url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
	658	webpage = self._download_webpage(url, video_id)
	659
	660	re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.?))\\x22 type=\\x22video/(?P<ext>.?)\\x22' % (blog, video_id)
	661	video = re.search(re_video, webpage)
	662	if video is None:
	663	raise ExtractorError(u'Unable to extract video')
	664	video_url = video.group('video_url')
	665	ext = video.group('ext')
	666
	667	video_thumbnail = self._search_regex(r'posters(.?)\[\\x22(?P<thumb>.?)\\x22',
	668	webpage, u'thumbnail', fatal=False) # We pick the first poster
	669	if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
	670
	671	# The only place where you can get a title, it's not complete,
	672	# but searching in other places doesn't work for all videos
	673	video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
	674	webpage, u'title', flags=re.DOTALL)
	675
	676	return [{'id': video_id,
	677	'url': video_url,
	678	'title': video_title,
	679	'thumbnail': video_thumbnail,
	680	'ext': ext
	681	}]
	682
	683	class BandcampIE(InfoExtractor):
	684	_VALID_URL = r'http://.?\.bandcamp\.com/track/(?P<title>.)'
	685
	686	def _real_extract(self, url):
	687	mobj = re.match(self._VALID_URL, url)
	688	title = mobj.group('title')
	689	webpage = self._download_webpage(url, title)
	690	# We get the link to the free download page
	691	m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
	692	if m_download is None:
	693	raise ExtractorError(u'No free songs found')
	694
	695	download_link = m_download.group(1)
	696	id = re.search(r'var TralbumData = {(.?)id: (?P<id>\d?)$',
	697	webpage, re.MULTILINE\|re.DOTALL).group('id')
	698
	699	download_webpage = self._download_webpage(download_link, id,
	700	'Downloading free downloads page')
	701	# We get the dictionary of the track from some javascrip code
	702	info = re.search(r'items: (.*?),$',
	703	download_webpage, re.MULTILINE).group(1)
	704	info = json.loads(info)[0]
	705	# We pick mp3-320 for now, until format selection can be easily implemented.
	706	mp3_info = info[u'downloads'][u'mp3-320']
	707	# If we try to use this url it says the link has expired
	708	initial_url = mp3_info[u'url']
	709	re_url = r'(?P<server>http://(.?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.?)&id=(?P<id>.?)&ts=(?P<ts>.)$'
	710	m_url = re.match(re_url, initial_url)
	711	#We build the url we will use to get the final track url
	712	# This url is build in Bandcamp in the script download_bunde_*.js
	713	request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
	714	final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
	715	# If we could correctly generate the .rand field the url would be
	716	#in the "download_url" key
	717	final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
	718
	719	track_info = {'id':id,
	720	'title' : info[u'title'],
	721	'ext' : 'mp3',
	722	'url' : final_url,
	723	'thumbnail' : info[u'thumb_url'],
	724	'uploader' : info[u'artist']
	725	}
	726
	727	return [track_info]
	728
	729	class RedTubeIE(InfoExtractor):
	730	"""Information Extractor for redtube"""
	731	_VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
	732
	733	def _real_extract(self,url):
	734	mobj = re.match(self._VALID_URL, url)
	735	if mobj is None:
	736	raise ExtractorError(u'Invalid URL: %s' % url)
	737
	738	video_id = mobj.group('id')
	739	video_extension = 'mp4'
	740	webpage = self._download_webpage(url, video_id)
	741
	742	self.report_extraction(video_id)
	743
	744	video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
	745	webpage, u'video URL')
	746
	747	video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
	748	webpage, u'title')
	749
	750	return [{
	751	'id': video_id,
	752	'url': video_url,
	753	'ext': video_extension,
	754	'title': video_title,
	755	}]
	756
	757	class InaIE(InfoExtractor):
	758	"""Information Extractor for Ina.fr"""
	759	_VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
	760
	761	def _real_extract(self,url):
	762	mobj = re.match(self._VALID_URL, url)
	763
	764	video_id = mobj.group('id')
	765	mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
	766	video_extension = 'mp4'
	767	webpage = self._download_webpage(mrss_url, video_id)
	768
	769	self.report_extraction(video_id)
	770
	771	video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
	772	webpage, u'video URL')
	773
	774	video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
	775	webpage, u'title')
	776
	777	return [{
	778	'id': video_id,
	779	'url': video_url,
	780	'ext': video_extension,
	781	'title': video_title,
	782	}]
	783
	784	class HowcastIE(InfoExtractor):
	785	"""Information Extractor for Howcast.com"""
	786	_VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
	787
	788	def _real_extract(self, url):
	789	mobj = re.match(self._VALID_URL, url)
	790
	791	video_id = mobj.group('id')
	792	webpage_url = 'http://www.howcast.com/videos/' + video_id
	793	webpage = self._download_webpage(webpage_url, video_id)
	794
	795	self.report_extraction(video_id)
	796
	797	video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
	798	webpage, u'video URL')
	799
	800	video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"\|\'([^\']+)\') property=\'og:title\'',
	801	webpage, u'title')
	802
	803	video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"\|\'([^\']+)\') name=\'description\'',
	804	webpage, u'description', fatal=False)
	805
	806	thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
	807	webpage, u'thumbnail', fatal=False)
	808
	809	return [{
	810	'id': video_id,
	811	'url': video_url,
	812	'ext': 'mp4',
	813	'title': video_title,
	814	'description': video_description,
	815	'thumbnail': thumbnail,
	816	}]
	817
	818	class VineIE(InfoExtractor):
	819	"""Information Extractor for Vine.co"""
	820	_VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
	821
	822	def _real_extract(self, url):
	823	mobj = re.match(self._VALID_URL, url)
	824
	825	video_id = mobj.group('id')
	826	webpage_url = 'https://vine.co/v/' + video_id
	827	webpage = self._download_webpage(webpage_url, video_id)
	828
	829	self.report_extraction(video_id)
	830
	831	video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
	832	webpage, u'video URL')
	833
	834	video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
	835	webpage, u'title')
	836
	837	thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
	838	webpage, u'thumbnail', fatal=False)
	839
	840	uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
	841	webpage, u'uploader', fatal=False, flags=re.DOTALL)
	842
	843	return [{
	844	'id': video_id,
	845	'url': video_url,
	846	'ext': 'mp4',
	847	'title': video_title,
	848	'thumbnail': thumbnail,
	849	'uploader': uploader,
	850	}]
	851
	852	class FlickrIE(InfoExtractor):
	853	"""Information Extractor for Flickr videos"""
	854	_VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
	855
	856	def _real_extract(self, url):
	857	mobj = re.match(self._VALID_URL, url)
	858
	859	video_id = mobj.group('id')
	860	video_uploader_id = mobj.group('uploader_id')
	861	webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
	862	webpage = self._download_webpage(webpage_url, video_id)
	863
	864	secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
	865
	866	first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
	867	first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
	868
	869	node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
	870	first_xml, u'node_id')
	871
	872	second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
	873	second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
	874
	875	self.report_extraction(video_id)
	876
	877	mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
	878	if mobj is None:
	879	raise ExtractorError(u'Unable to extract video url')
	880	video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
	881
	882	video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"\|\'([^\']+)\')',
	883	webpage, u'video title')
	884
	885	video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"\|\'([^\']+)\')',
	886	webpage, u'description', fatal=False)
	887
	888	thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"\|\'([^\']+)\')',
	889	webpage, u'thumbnail', fatal=False)
	890
	891	return [{
	892	'id': video_id,
	893	'url': video_url,
	894	'ext': 'mp4',
	895	'title': video_title,
	896	'description': video_description,
	897	'thumbnail': thumbnail,
	898	'uploader_id': video_uploader_id,
	899	}]
	900
	901	class TeamcocoIE(InfoExtractor):
	902	_VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
	903
	904	def _real_extract(self, url):
	905	mobj = re.match(self._VALID_URL, url)
	906	if mobj is None:
	907	raise ExtractorError(u'Invalid URL: %s' % url)
	908	url_title = mobj.group('url_title')
	909	webpage = self._download_webpage(url, url_title)
	910
	911	video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
	912	webpage, u'video id')
	913
	914	self.report_extraction(video_id)
	915
	916	video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
	917	webpage, u'title')
	918
	919	thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
	920	webpage, u'thumbnail', fatal=False)
	921
	922	video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
	923	webpage, u'description', fatal=False)
	924
	925	data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
	926	data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
	927
	928	video_url = self._html_search_regex(r'<file type="high".?>(.?)</file>',
	929	data, u'video URL')
	930
	931	return [{
	932	'id': video_id,
	933	'url': video_url,
	934	'ext': 'mp4',
	935	'title': video_title,
	936	'thumbnail': thumbnail,
	937	'description': video_description,
	938	}]
	939
	940	class XHamsterIE(InfoExtractor):
	941	"""Information Extractor for xHamster"""
	942	_VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
	943
	944	def _real_extract(self,url):
	945	mobj = re.match(self._VALID_URL, url)
	946
	947	video_id = mobj.group('id')
	948	mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
	949	webpage = self._download_webpage(mrss_url, video_id)
	950
	951	mobj = re.search(r'\'srv\': \'(?P<server>[^\'])\',\s\'file\': \'(?P<file>[^\']+)\',', webpage)
	952	if mobj is None:
	953	raise ExtractorError(u'Unable to extract media URL')
	954	if len(mobj.group('server')) == 0:
	955	video_url = compat_urllib_parse.unquote(mobj.group('file'))
	956	else:
	957	video_url = mobj.group('server')+'/key='+mobj.group('file')
	958	video_extension = video_url.split('.')[-1]
	959
	960	video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
	961	webpage, u'title')
	962
	963	# Can't see the description anywhere in the UI
	964	# video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
	965	# webpage, u'description', fatal=False)
	966	# if video_description: video_description = unescapeHTML(video_description)
	967
	968	mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
	969	if mobj:
	970	video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
	971	else:
	972	video_upload_date = None
	973	self._downloader.report_warning(u'Unable to extract upload date')
	974
	975	video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
	976	webpage, u'uploader id', default=u'anonymous')
	977
	978	video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
	979	webpage, u'thumbnail', fatal=False)
	980
	981	return [{
	982	'id': video_id,
	983	'url': video_url,
	984	'ext': video_extension,
	985	'title': video_title,
	986	# 'description': video_description,
	987	'upload_date': video_upload_date,
	988	'uploader_id': video_uploader_id,
	989	'thumbnail': video_thumbnail
	990	}]
	991
	992	class HypemIE(InfoExtractor):
	993	"""Information Extractor for hypem"""
	994	_VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
	995
	996	def _real_extract(self, url):
	997	mobj = re.match(self._VALID_URL, url)
	998	if mobj is None:
	999	raise ExtractorError(u'Invalid URL: %s' % url)
	1000	track_id = mobj.group(1)
	1001
	1002	data = { 'ax': 1, 'ts': time.time() }
	1003	data_encoded = compat_urllib_parse.urlencode(data)
	1004	complete_url = url + "?" + data_encoded
	1005	request = compat_urllib_request.Request(complete_url)
	1006	response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
	1007	cookie = urlh.headers.get('Set-Cookie', '')
	1008
	1009	self.report_extraction(track_id)
	1010
	1011	html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
	1012	response, u'tracks', flags=re.MULTILINE\|re.DOTALL).strip()
	1013	try:
	1014	track_list = json.loads(html_tracks)
	1015	track = track_list[u'tracks'][0]
	1016	except ValueError:
	1017	raise ExtractorError(u'Hypemachine contained invalid JSON.')
	1018
	1019	key = track[u"key"]
	1020	track_id = track[u"id"]
	1021	artist = track[u"artist"]
	1022	title = track[u"song"]
	1023
	1024	serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
	1025	request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
	1026	request.add_header('cookie', cookie)
	1027	song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
	1028	try:
	1029	song_data = json.loads(song_data_json)
	1030	except ValueError:
	1031	raise ExtractorError(u'Hypemachine contained invalid JSON.')
	1032	final_url = song_data[u"url"]
	1033
	1034	return [{
	1035	'id': track_id,
	1036	'url': final_url,
	1037	'ext': "mp3",
	1038	'title': title,
	1039	'artist': artist,
	1040	}]
	1041
	1042	class Vbox7IE(InfoExtractor):
	1043	"""Information Extractor for Vbox7"""
	1044	_VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
	1045
	1046	def _real_extract(self,url):
	1047	mobj = re.match(self._VALID_URL, url)
	1048	if mobj is None:
	1049	raise ExtractorError(u'Invalid URL: %s' % url)
	1050	video_id = mobj.group(1)
	1051
	1052	redirect_page, urlh = self._download_webpage_handle(url, video_id)
	1053	new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
	1054	redirect_url = urlh.geturl() + new_location
	1055	webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
	1056
	1057	title = self._html_search_regex(r'<title>(.*)</title>',
	1058	webpage, u'title').split('/')[0].strip()
	1059
	1060	ext = "flv"
	1061	info_url = "http://vbox7.com/play/magare.do"
	1062	data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
	1063	info_request = compat_urllib_request.Request(info_url, data)
	1064	info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
	1065	info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
	1066	if info_response is None:
	1067	raise ExtractorError(u'Unable to extract the media url')
	1068	(final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
	1069
	1070	return [{
	1071	'id': video_id,
	1072	'url': final_url,
	1073	'ext': ext,
	1074	'title': title,
	1075	'thumbnail': thumbnail_url,
	1076	}]
	1077
	1078
	1079	def gen_extractors():
	1080	""" Return a list of an instance of every supported extractor.
	1081	The order does matter; the first extractor matched is the one handling the URL.
	1082	"""
	1083	return [
	1084	YoutubePlaylistIE(),
	1085	YoutubeChannelIE(),
	1086	YoutubeUserIE(),
	1087	YoutubeSearchIE(),
	1088	YoutubeIE(),
	1089	MetacafeIE(),
	1090	DailymotionIE(),
	1091	GoogleSearchIE(),
	1092	PhotobucketIE(),
	1093	YahooIE(),
	1094	YahooSearchIE(),
	1095	DepositFilesIE(),
	1096	FacebookIE(),
	1097	BlipTVIE(),
	1098	BlipTVUserIE(),
	1099	VimeoIE(),
	1100	MyVideoIE(),
	1101	ComedyCentralIE(),
	1102	EscapistIE(),
	1103	CollegeHumorIE(),
	1104	XVideosIE(),
	1105	SoundcloudSetIE(),
	1106	SoundcloudIE(),
	1107	InfoQIE(),
	1108	MixcloudIE(),
	1109	StanfordOpenClassroomIE(),
	1110	MTVIE(),
	1111	YoukuIE(),
	1112	XNXXIE(),
	1113	YouJizzIE(),
	1114	PornotubeIE(),
	1115	YouPornIE(),
	1116	GooglePlusIE(),
	1117	ArteTvIE(),
	1118	NBAIE(),
	1119	WorldStarHipHopIE(),
	1120	JustinTVIE(),
	1121	FunnyOrDieIE(),
	1122	SteamIE(),
	1123	UstreamIE(),
	1124	RBMARadioIE(),
	1125	EightTracksIE(),
	1126	KeekIE(),
	1127	TEDIE(),
	1128	MySpassIE(),
	1129	SpiegelIE(),
	1130	LiveLeakIE(),
	1131	ARDIE(),
	1132	ZDFIE(),
	1133	TumblrIE(),
	1134	BandcampIE(),
	1135	RedTubeIE(),
	1136	InaIE(),
	1137	HowcastIE(),
	1138	VineIE(),
	1139	FlickrIE(),
	1140	TeamcocoIE(),
	1141	XHamsterIE(),
	1142	HypemIE(),
	1143	Vbox7IE(),
	1144	GametrailersIE(),
	1145	StatigramIE(),
	1146	GenericIE()
	1147	]
	1148
	1149	def get_info_extractor(ie_name):
	1150	"""Returns the info extractor class with the given ie_name"""
	1151	return globals()[ie_name+'IE']