10 import xml
.etree
.ElementTree
19 from .extractor
.common
import InfoExtractor
, SearchInfoExtractor
21 from .extractor
.ard
import ARDIE
22 from .extractor
.arte
import ArteTvIE
23 from .extractor
.bliptv
import BlipTVIE
, BlipTVUserIE
24 from .extractor
.comedycentral
import ComedyCentralIE
25 from .extractor
.collegehumor
import CollegeHumorIE
26 from .extractor
.dailymotion
import DailymotionIE
27 from .extractor
.depositfiles
import DepositFilesIE
28 from .extractor
.eighttracks
import EightTracksIE
29 from .extractor
.escapist
import EscapistIE
30 from .extractor
.facebook
import FacebookIE
31 from .extractor
.funnyordie
import FunnyOrDieIE
32 from .extractor
.gametrailers
import GametrailersIE
33 from .extractor
.generic
import GenericIE
34 from .extractor
.googleplus
import GooglePlusIE
35 from .extractor
.googlesearch
import GoogleSearchIE
36 from .extractor
.infoq
import InfoQIE
37 from .extractor
.justintv
import JustinTVIE
38 from .extractor
.keek
import KeekIE
39 from .extractor
.metacafe
import MetacafeIE
40 from .extractor
.mixcloud
import MixcloudIE
41 from .extractor
.mtv
import MTVIE
42 from .extractor
.myspass
import MySpassIE
43 from .extractor
.myvideo
import MyVideoIE
44 from .extractor
.nba
import NBAIE
45 from .extractor
.statigram
import StatigramIE
46 from .extractor
.photobucket
import PhotobucketIE
47 from .extractor
.pornotube
import PornotubeIE
48 from .extractor
.rbmaradio
import RBMARadioIE
49 from .extractor
.soundcloud
import SoundcloudIE
, SoundcloudSetIE
50 from .extractor
.spiegel
import SpiegelIE
51 from .extractor
.stanfordoc
import StanfordOpenClassroomIE
52 from .extractor
.steam
import SteamIE
53 from .extractor
.ted
import TEDIE
54 from .extractor
.ustream
import UstreamIE
55 from .extractor
.vimeo
import VimeoIE
56 from .extractor
.worldstarhiphop
import WorldStarHipHopIE
57 from .extractor
.xnxx
import XNXXIE
58 from .extractor
.xvideos
import XVideosIE
59 from .extractor
.yahoo
import YahooIE
, YahooSearchIE
60 from .extractor
.youjizz
import YouJizzIE
61 from .extractor
.youku
import YoukuIE
62 from .extractor
.youporn
import YouPornIE
63 from .extractor
.youtube
import YoutubeIE
, YoutubePlaylistIE
, YoutubeSearchIE
, YoutubeUserIE
, YoutubeChannelIE
64 from .extractor
.zdf
import ZDFIE
class LiveLeakIE(InfoExtractor):
    """Information Extractor for LiveLeak"""

    # The scheme part is "https?" so that http:// and https:// URLs both
    # match; "http?" only made the trailing "p" optional.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title value carries a site prefix that is stripped here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are looked up with fatal=False.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 -- confirm
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr video posts"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL or if no
        video source can be found in the post page.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical /post/ page, even for /video/ URLs.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # blog and video_id come from the URL, so they are escaped before
        # being interpolated into the pattern and are matched literally.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster. 'thumb' is the only capturing group in
        # the pattern, so the text between "posters" and the opening bracket
        # can never be handed back in its place.
        # NOTE(review): presumably _search_regex returns the first group
        # that matched -- verify against its definition.
        video_thumbnail = self._search_regex(r'posters.*?\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL, if the track
        page has no free download link, or if any of the intermediate pages
        does not contain the expected data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Only the <source> tag declared as video/mp4 is considered.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The data is read from the MRSS notice of the video, not from the
        # page the user passed in.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is always requested through its canonical URL.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The meta content attribute may use double or single quotes.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # The video URL pattern above only accepts .mp4 files.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is always requested through its canonical https URL.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The optional second group swallows a query string after the image URL.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 -- confirm
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL or if the
        playlist data contains no stream entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The secret found in the photo page is needed by both data requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The final URL is the APP value followed by the HTML-unescaped FULLPATH value.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        # The meta content attribute may use double or single quotes.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 -- confirm
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is not part of the URL; it is read from the page.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # Only the file entry marked type="high" is used.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 -- confirm
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL or if the
        media location cannot be found in the page.  A missing upload date
        only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # With an empty server the file value is itself a quoted URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Year, month and day are concatenated into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL or if either
        of the two JSON documents cannot be decoded.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is sent back with the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): 'key' is assumed to be a field of the track entry -- confirm
        key = track[u"key"]
        # From here on the id reported by the site replaces the one taken from the URL.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            # NOTE(review): container assumed to be mp3 -- confirm
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError if url does not match _VALID_URL or if the
        info request yields no response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only sets window.location; follow that redirect by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is unpacked as two '&'-separated name=value fields.
        # Each field is split on the first '=' only, so a value that itself
        # contains '=' is kept whole.
        (final_url, thumbnail_url) = [field.split('=', 1)[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            # NOTE(review): container assumed to be flv -- confirm
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
564 def gen_extractors():
565 """ Return a list of an instance of every supported extractor.
566 The order does matter; the first extractor matched is the one handling the URL.
594 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name

    The class is looked up in this module's globals under the name
    ie_name + 'IE'; a KeyError is raised if no such global exists.
    """
    return globals()[ie_name + 'IE']