10 import xml
.etree
.ElementTree
19 from .extractor
.common
import InfoExtractor
, SearchInfoExtractor
21 from .extractor
.ard
import ARDIE
22 from .extractor
.arte
import ArteTvIE
23 from .extractor
.bliptv
import BlipTVIE
, BlipTVUserIE
24 from .extractor
.comedycentral
import ComedyCentralIE
25 from .extractor
.collegehumor
import CollegeHumorIE
26 from .extractor
.dailymotion
import DailymotionIE
27 from .extractor
.depositfiles
import DepositFilesIE
28 from .extractor
.escapist
import EscapistIE
29 from .extractor
.facebook
import FacebookIE
30 from .extractor
.funnyordie
import FunnyOrDieIE
31 from .extractor
.gametrailers
import GametrailersIE
32 from .extractor
.generic
import GenericIE
33 from .extractor
.googleplus
import GooglePlusIE
34 from .extractor
.googlesearch
import GoogleSearchIE
35 from .extractor
.infoq
import InfoQIE
36 from .extractor
.metacafe
import MetacafeIE
37 from .extractor
.mixcloud
import MixcloudIE
38 from .extractor
.mtv
import MTVIE
39 from .extractor
.myvideo
import MyVideoIE
40 from .extractor
.nba
import NBAIE
41 from .extractor
.statigram
import StatigramIE
42 from .extractor
.photobucket
import PhotobucketIE
43 from .extractor
.soundcloud
import SoundcloudIE
, SoundcloudSetIE
44 from .extractor
.stanfordoc
import StanfordOpenClassroomIE
45 from .extractor
.steam
import SteamIE
46 from .extractor
.ted
import TEDIE
47 from .extractor
.vimeo
import VimeoIE
48 from .extractor
.worldstarhiphop
import WorldStarHipHopIE
49 from .extractor
.xnxx
import XNXXIE
50 from .extractor
.xvideos
import XVideosIE
51 from .extractor
.yahoo
import YahooIE
, YahooSearchIE
52 from .extractor
.youku
import YoukuIE
53 from .extractor
.youtube
import YoutubeIE
, YoutubePlaylistIE
, YoutubeSearchIE
, YoutubeUserIE
, YoutubeChannelIE
54 from .extractor
.zdf
import ZDFIE
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    # NOTE(review): the channelid alternative and trailing anchor were not
    # visible in the reviewed source and are reconstructed — confirm.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one page of the justin.tv API and convert each clip
        into an info dict. Returns (raw item count, list of info dicts)."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # The API signals errors by returning an object instead of a list.
        # (was: type(response) != list — isinstance is the idiomatic check)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time looks like 'YYYY-MM-DD...'; strip dashes to get YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract videos for a channel (paged), a chapter, or one broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                             note='Downloading chapter metadata',
                                             errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class UstreamIE(InfoExtractor):
    """Extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'

    def _real_extract(self, url):
        """Build the CDN URL from the numeric id and scrape page metadata."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The actual media lives on the CDN under the recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',  # NOTE(review): container not visible in source — confirm
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }]
class RBMARadioIE(InfoExtractor):
    """Extractor for rbmaradio.com show pages (JSON embedded in the page)."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is assigned to window.gon.show as a JS literal.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force the 256 kbit/s variant of the Akamai stream.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict matching req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # BUGFIX: was a bare `except:` which also swallowed KeyboardInterrupt etc.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # BUGFIX: str + exception object raised TypeError; use %s formatting.
            raise ExtractorError('Missing JSON parameter: %s' % sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # e.g. ['480p', '370k'] — resolution and bitrate components.
            fmt = path.split('/')[4].split('_')[:2]
            fmt = "-".join(fmt)
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': fmt,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            fmt = self._specific(req_format, formats)
            if fmt is None:
                raise ExtractorError(u'Requested format not available')
            return [fmt]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The slug in the URL doubles as the title.
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits inside the player configuration.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Upload date is optional; normalize it when present.
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',  # NOTE(review): reconstructed from the flv media URL pattern above
        }]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Title comes straight from the page <title>.
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The media is served through a separate embed page; find it first.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The embed page carries the canonical numeric id.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',  # NOTE(review): reconstructed — flash player variable above suggests flv
            'player_url': embed_page_url,
        }]
class EightTracksIE(InfoExtractor):
    """Extractor for 8tracks.com mixes — walks the play/next API per track."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session token is required by the streaming API.
        session = str(random.randint(0, 1000000000))
        # NOTE(review): mix id binding not visible in source — reconstructed
        # from its use in first_url/next_url below; confirm it is data['id'].
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            })
            # The API flags the final track; stop paging then.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Extractor for keek.com clips — media URLs are derived from the id."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        matched = re.match(self._VALID_URL, url)
        video_id = matched.group('videoID')

        # Both media and thumbnail URLs are deterministic CDN paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',  # NOTE(review): container not visible in source — confirm
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class MySpassIE(InfoExtractor):
    """Extractor for myspass.de — reads the site's XML metadata service."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the metadata document for this id.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): fallback value not visible in source — falling
            # back to the file extension; confirm against original behavior.
            video_format = extension
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
        }]
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos — resolves the flash XML descriptor."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Each video has a sidecar XML document listing the encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # NOTE(review): the selection of `last_type` was not visible in the
        # source; picking the last (highest-quality) type node — confirm.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Player config carries the direct file URL.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site branding prefix from the og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',  # NOTE(review): container not visible in source — confirm
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
class TumblrIE(InfoExtractor):
    """Extractor for video posts hosted on *.tumblr.com."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL regardless of input form.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is hex-escaped (\x22 == '"') inside inline JS.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster frame as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext,
                 }]
class BandcampIE(InfoExtractor):
    """Extractor for free Bandcamp tracks — resolves the statdownload URL."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # BUGFIX: local was named `id`, shadowing the builtin — renamed.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      'ext': 'mp3',  # fixed: only the mp3-320 download is used above
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Direct mp4 source from the HTML5 player.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Metadata (including the mp4 URL) comes from the player's MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Canonicalize to the desktop page regardless of input variant.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',  # the matched media URL above is always .mp4
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The twitter:player:stream meta tag carries the direct media URL.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Drop any query string from the thumbnail URL.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',  # NOTE(review): container not visible in source — confirm
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Flickr requires the per-photo secret for its playlist endpoints.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The stream URL is APP + FULLPATH from the playlist XML.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',  # NOTE(review): container not visible in source — confirm
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com — resolves the CVP XML data document."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives on the <article> element.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # Prefer the high-quality file entry from the data document.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',  # NOTE(review): container not visible in source — confirm
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # BUGFIX: the 'www.' dot was unescaped and matched any character.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # Player config: optional server prefix plus file token.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server: 'file' is a percent-encoded absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped request is required to receive the session cookie.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): the `key` binding was not visible in the source —
        # reconstructed from its use in serve_url below; confirm it is track['key'].
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Exchange id+key for the actual stream URL; the cookie is mandatory.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': 'mp3',  # NOTE(review): container not visible in source — confirm
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play URL issues a JS redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Everything before the first '/' in <title> is the clip title.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint to obtain media and thumbnail URLs.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is 'key1=url1&key2=url2'; keep only the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',  # NOTE(review): container not visible in source — confirm
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1079 def gen_extractors():
1080 """ Return a list of an instance of every supported extractor.
1081 The order does matter; the first extractor matched is the one handling the URL.
1084 YoutubePlaylistIE(),
1109 StanfordOpenClassroomIE(),
1119 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name.

    Looks up the class named ``<ie_name>IE`` in this module's globals;
    raises KeyError if no such extractor is defined.
    """
    return globals()['%sIE' % ie_name]