10 import xml
.etree
.ElementTree
19 from .extractor
.common
import InfoExtractor
, SearchInfoExtractor
21 from .extractor
.ard
import ARDIE
22 from .extractor
.arte
import ArteTvIE
23 from .extractor
.bliptv
import BlipTVIE
, BlipTVUserIE
24 from .extractor
.comedycentral
import ComedyCentralIE
25 from .extractor
.collegehumor
import CollegeHumorIE
26 from .extractor
.dailymotion
import DailymotionIE
27 from .extractor
.depositfiles
import DepositFilesIE
28 from .extractor
.escapist
import EscapistIE
29 from .extractor
.facebook
import FacebookIE
30 from .extractor
.gametrailers
import GametrailersIE
31 from .extractor
.generic
import GenericIE
32 from .extractor
.googleplus
import GooglePlusIE
33 from .extractor
.googlesearch
import GoogleSearchIE
34 from .extractor
.infoq
import InfoQIE
35 from .extractor
.metacafe
import MetacafeIE
36 from .extractor
.myvideo
import MyVideoIE
37 from .extractor
.nba
import NBAIE
38 from .extractor
.statigram
import StatigramIE
39 from .extractor
.photobucket
import PhotobucketIE
40 from .extractor
.soundcloud
import SoundcloudIE
, SoundcloudSetIE
41 from .extractor
.stanfordoc
import StanfordOpenClassroomIE
42 from .extractor
.vimeo
import VimeoIE
43 from .extractor
.xvideos
import XVideosIE
44 from .extractor
.yahoo
import YahooIE
, YahooSearchIE
45 from .extractor
.youtube
import YoutubeIE
, YoutubePlaylistIE
, YoutubeSearchIE
, YoutubeUserIE
, YoutubeChannelIE
46 from .extractor
.zdf
import ZDFIE
50 class MixcloudIE(InfoExtractor
):
51 """Information extractor for www.mixcloud.com"""
53 _WORKING
= False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
54 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
def report_download_json(self, file_id):
    """Announce on screen that the JSON metadata is being downloaded.

    ``file_id`` is accepted but not included in the message.
    """
    message = u'Downloading json'
    self.to_screen(message)
61 def get_urls(self
, jsonData
, fmt
, bitrate
='best'):
62 """Get urls from 'audio_formats' section in json"""
65 bitrate_list
= jsonData
[fmt
]
66 if bitrate
is None or bitrate
== 'best' or bitrate
not in bitrate_list
:
67 bitrate
= max(bitrate_list
) # select highest
69 url_list
= jsonData
[fmt
][bitrate
]
70 except TypeError: # we have no bitrate info.
71 url_list
= jsonData
[fmt
]
74 def check_urls(self
, url_list
):
75 """Returns 1st active url from list"""
78 compat_urllib_request
.urlopen(url
)
80 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
85 def _print_formats(self
, formats
):
86 print('Available formats:')
87 for fmt
in formats
.keys():
88 for b
in formats
[fmt
]:
90 ext
= formats
[fmt
][b
][0]
91 print('%s\t%s\t[%s]' % (fmt
, b
, ext
.split('.')[-1]))
92 except TypeError: # we have no bitrate info
94 print('%s\t%s\t[%s]' % (fmt
, '??', ext
.split('.')[-1]))
97 def _real_extract(self
, url
):
98 mobj
= re
.match(self
._VALID
_URL
, url
)
100 raise ExtractorError(u
'Invalid URL: %s' % url
)
101 # extract uploader & filename from url
102 uploader
= mobj
.group(1).decode('utf-8')
103 file_id
= uploader
+ "-" + mobj
.group(2).decode('utf-8')
105 # construct API request
106 file_url
= 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url
.split('/')[-3:-1]) + '.json'
107 # retrieve .json file with links to files
108 request
= compat_urllib_request
.Request(file_url
)
110 self
.report_download_json(file_url
)
111 jsonData
= compat_urllib_request
.urlopen(request
).read()
112 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
113 raise ExtractorError(u
'Unable to retrieve file: %s' % compat_str(err
))
116 json_data
= json
.loads(jsonData
)
117 player_url
= json_data
['player_swf_url']
118 formats
= dict(json_data
['audio_formats'])
120 req_format
= self
._downloader
.params
.get('format', None)
123 if self
._downloader
.params
.get('listformats', None):
124 self
._print
_formats
(formats
)
127 if req_format
is None or req_format
== 'best':
128 for format_param
in formats
.keys():
129 url_list
= self
.get_urls(formats
, format_param
)
131 file_url
= self
.check_urls(url_list
)
132 if file_url
is not None:
135 if req_format
not in formats
:
136 raise ExtractorError(u
'Format is not available')
138 url_list
= self
.get_urls(formats
, req_format
)
139 file_url
= self
.check_urls(url_list
)
140 format_param
= req_format
143 'id': file_id
.decode('utf-8'),
144 'url': file_url
.decode('utf-8'),
145 'uploader': uploader
.decode('utf-8'),
147 'title': json_data
['name'],
148 'ext': file_url
.split('.')[-1].decode('utf-8'),
149 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
150 'thumbnail': json_data
['thumbnail_url'],
151 'description': json_data
['description'],
152 'player_url': player_url
.decode('utf-8'),
156 class MTVIE(InfoExtractor
):
157 """Information extractor for MTV.com"""
159 _VALID_URL
= r
'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
162 def _real_extract(self
, url
):
163 mobj
= re
.match(self
._VALID
_URL
, url
)
165 raise ExtractorError(u
'Invalid URL: %s' % url
)
166 if not mobj
.group('proto'):
167 url
= 'http://' + url
168 video_id
= mobj
.group('videoid')
170 webpage
= self
._download
_webpage
(url
, video_id
)
172 song_name
= self
._html
_search
_regex
(r
'<meta name="mtv_vt" content="([^"]+)"/>',
173 webpage
, u
'song name', fatal
=False)
175 video_title
= self
._html
_search
_regex
(r
'<meta name="mtv_an" content="([^"]+)"/>',
178 mtvn_uri
= self
._html
_search
_regex
(r
'<meta name="mtvn_uri" content="([^"]+)"/>',
179 webpage
, u
'mtvn_uri', fatal
=False)
181 content_id
= self
._search
_regex
(r
'MTVN.Player.defaultPlaylistId = ([0-9]+);',
182 webpage
, u
'content id', fatal
=False)
184 videogen_url
= 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri
+ '&id=' + content_id
+ '&vid=' + video_id
+ '&ref=www.mtvn.com&viewUri=' + mtvn_uri
185 self
.report_extraction(video_id
)
186 request
= compat_urllib_request
.Request(videogen_url
)
188 metadataXml
= compat_urllib_request
.urlopen(request
).read()
189 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
190 raise ExtractorError(u
'Unable to download video metadata: %s' % compat_str(err
))
192 mdoc
= xml
.etree
.ElementTree
.fromstring(metadataXml
)
193 renditions
= mdoc
.findall('.//rendition')
195 # For now, always pick the highest quality.
196 rendition
= renditions
[-1]
199 _
,_
,ext
= rendition
.attrib
['type'].partition('/')
200 format
= ext
+ '-' + rendition
.attrib
['width'] + 'x' + rendition
.attrib
['height'] + '_' + rendition
.attrib
['bitrate']
201 video_url
= rendition
.find('./src').text
203 raise ExtractorError('Invalid rendition field.')
208 'uploader': performer
,
210 'title': video_title
,
218 class YoukuIE(InfoExtractor
):
219 _VALID_URL
= r
'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
222 nowTime
= int(time
.time() * 1000)
223 random1
= random
.randint(1000,1998)
224 random2
= random
.randint(1000,9999)
226 return "%d%d%d" %(nowTime
,random1
,random2
)
228 def _get_file_ID_mix_string(self
, seed
):
230 source
= list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
232 for i
in range(len(source
)):
233 seed
= (seed
* 211 + 30031 ) % 65536
234 index
= math
.floor(seed
/ 65536 * len(source
) )
235 mixed
.append(source
[int(index
)])
236 source
.remove(source
[int(index
)])
237 #return ''.join(mixed)
240 def _get_file_id(self
, fileId
, seed
):
241 mixed
= self
._get
_file
_ID
_mix
_string
(seed
)
242 ids
= fileId
.split('*')
246 realId
.append(mixed
[int(ch
)])
247 return ''.join(realId
)
249 def _real_extract(self
, url
):
250 mobj
= re
.match(self
._VALID
_URL
, url
)
252 raise ExtractorError(u
'Invalid URL: %s' % url
)
253 video_id
= mobj
.group('ID')
255 info_url
= 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
257 jsondata
= self
._download
_webpage
(info_url
, video_id
)
259 self
.report_extraction(video_id
)
261 config
= json
.loads(jsondata
)
263 video_title
= config
['data'][0]['title']
264 seed
= config
['data'][0]['seed']
266 format
= self
._downloader
.params
.get('format', None)
267 supported_format
= list(config
['data'][0]['streamfileids'].keys())
269 if format
is None or format
== 'best':
270 if 'hd2' in supported_format
:
275 elif format
== 'worst':
283 fileid
= config
['data'][0]['streamfileids'][format
]
284 keys
= [s
['k'] for s
in config
['data'][0]['segs'][format
]]
285 except (UnicodeDecodeError, ValueError, KeyError):
286 raise ExtractorError(u
'Unable to extract info section')
289 sid
= self
._gen
_sid
()
290 fileid
= self
._get
_file
_id
(fileid
, seed
)
292 #column 8,9 of fileid represent the segment number
293 #fileid[7:9] should be changed
294 for index
, key
in enumerate(keys
):
296 temp_fileid
= '%s%02X%s' % (fileid
[0:8], index
, fileid
[10:])
297 download_url
= 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid
, index
, temp_fileid
, key
)
300 'id': '%s_part%02d' % (video_id
, index
),
304 'title': video_title
,
307 files_info
.append(info
)
312 class XNXXIE(InfoExtractor
):
313 """Information extractor for xnxx.com"""
315 _VALID_URL
= r
'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
317 VIDEO_URL_RE
= r
'flv_url=(.*?)&'
318 VIDEO_TITLE_RE
= r
'<title>(.*?)\s+-\s+XNXX.COM'
319 VIDEO_THUMB_RE
= r
'url_bigthumb=(.*?)&'
321 def _real_extract(self
, url
):
322 mobj
= re
.match(self
._VALID
_URL
, url
)
324 raise ExtractorError(u
'Invalid URL: %s' % url
)
325 video_id
= mobj
.group(1)
327 # Get webpage content
328 webpage
= self
._download
_webpage
(url
, video_id
)
330 video_url
= self
._search
_regex
(self
.VIDEO_URL_RE
,
331 webpage
, u
'video URL')
332 video_url
= compat_urllib_parse
.unquote(video_url
)
334 video_title
= self
._html
_search
_regex
(self
.VIDEO_TITLE_RE
,
337 video_thumbnail
= self
._search
_regex
(self
.VIDEO_THUMB_RE
,
338 webpage
, u
'thumbnail', fatal
=False)
345 'title': video_title
,
347 'thumbnail': video_thumbnail
,
354 class JustinTVIE(InfoExtractor
):
355 """Information extractor for justin.tv and twitch.tv"""
356 # TODO: One broadcast may be split into multiple videos. The key
357 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
358 # starts at 1 and increases. Can we treat all parts as one video?
360 _VALID_URL
= r
"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
362 (?P<channelid>[^/]+)|
363 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
364 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
368 _JUSTIN_PAGE_LIMIT
= 100
369 IE_NAME
= u
'justin.tv'
def report_download_page(self, channel, offset):
    """Print which slice of a channel's video list is being requested.

    The reported slice starts at ``offset`` and ends at ``offset`` plus
    the extractor's ``_JUSTIN_PAGE_LIMIT``.
    """
    last = offset + self._JUSTIN_PAGE_LIMIT
    template = u'%s: Downloading video information from %d to %d'
    self.to_screen(template % (channel, offset, last))
376 # Return count of items, list of *valid* items
377 def _parse_page(self
, url
, video_id
):
378 webpage
= self
._download
_webpage
(url
, video_id
,
379 u
'Downloading video info JSON',
380 u
'unable to download video info JSON')
382 response
= json
.loads(webpage
)
383 if type(response
) != list:
384 error_text
= response
.get('error', 'unknown error')
385 raise ExtractorError(u
'Justin.tv API: %s' % error_text
)
387 for clip
in response
:
388 video_url
= clip
['video_file_url']
390 video_extension
= os
.path
.splitext(video_url
)[1][1:]
391 video_date
= re
.sub('-', '', clip
['start_time'][:10])
392 video_uploader_id
= clip
.get('user_id', clip
.get('channel_id'))
393 video_id
= clip
['id']
394 video_title
= clip
.get('title', video_id
)
398 'title': video_title
,
399 'uploader': clip
.get('channel_name', video_uploader_id
),
400 'uploader_id': video_uploader_id
,
401 'upload_date': video_date
,
402 'ext': video_extension
,
404 return (len(response
), info
)
406 def _real_extract(self
, url
):
407 mobj
= re
.match(self
._VALID
_URL
, url
)
409 raise ExtractorError(u
'invalid URL: %s' % url
)
411 api_base
= 'http://api.justin.tv'
413 if mobj
.group('channelid'):
415 video_id
= mobj
.group('channelid')
416 api
= api_base
+ '/channel/archives/%s.json' % video_id
417 elif mobj
.group('chapterid'):
418 chapter_id
= mobj
.group('chapterid')
420 webpage
= self
._download
_webpage
(url
, chapter_id
)
421 m
= re
.search(r
'PP\.archive_id = "([0-9]+)";', webpage
)
423 raise ExtractorError(u
'Cannot find archive of a chapter')
424 archive_id
= m
.group(1)
426 api
= api_base
+ '/broadcast/by_chapter/%s.xml' % chapter_id
427 chapter_info_xml
= self
._download
_webpage
(api
, chapter_id
,
428 note
=u
'Downloading chapter information',
429 errnote
=u
'Chapter information download failed')
430 doc
= xml
.etree
.ElementTree
.fromstring(chapter_info_xml
)
431 for a
in doc
.findall('.//archive'):
432 if archive_id
== a
.find('./id').text
:
435 raise ExtractorError(u
'Could not find chapter in chapter information')
437 video_url
= a
.find('./video_file_url').text
438 video_ext
= video_url
.rpartition('.')[2] or u
'flv'
440 chapter_api_url
= u
'https://api.twitch.tv/kraken/videos/c' + chapter_id
441 chapter_info_json
= self
._download
_webpage
(chapter_api_url
, u
'c' + chapter_id
,
442 note
='Downloading chapter metadata',
443 errnote
='Download of chapter metadata failed')
444 chapter_info
= json
.loads(chapter_info_json
)
446 bracket_start
= int(doc
.find('.//bracket_start').text
)
447 bracket_end
= int(doc
.find('.//bracket_end').text
)
449 # TODO determine start (and probably fix up file)
450 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
451 #video_url += u'?start=' + TODO:start_timestamp
452 # bracket_start is 13290, but we want 51670615
453 self
._downloader
.report_warning(u
'Chapter detected, but we can just download the whole file. '
454 u
'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start
), formatSeconds(bracket_end
)))
457 'id': u
'c' + chapter_id
,
460 'title': chapter_info
['title'],
461 'thumbnail': chapter_info
['preview'],
462 'description': chapter_info
['description'],
463 'uploader': chapter_info
['channel']['display_name'],
464 'uploader_id': chapter_info
['channel']['name'],
468 video_id
= mobj
.group('videoid')
469 api
= api_base
+ '/broadcast/by_archive/%s.json' % video_id
471 self
.report_extraction(video_id
)
475 limit
= self
._JUSTIN
_PAGE
_LIMIT
478 self
.report_download_page(video_id
, offset
)
479 page_url
= api
+ ('?offset=%d&limit=%d' % (offset
, limit
))
480 page_count
, page_info
= self
._parse
_page
(page_url
, video_id
)
481 info
.extend(page_info
)
482 if not paged
or page_count
!= limit
:
487 class FunnyOrDieIE(InfoExtractor
):
488 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
490 def _real_extract(self
, url
):
491 mobj
= re
.match(self
._VALID
_URL
, url
)
493 raise ExtractorError(u
'invalid URL: %s' % url
)
495 video_id
= mobj
.group('id')
496 webpage
= self
._download
_webpage
(url
, video_id
)
498 video_url
= self
._html
_search
_regex
(r
'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
499 webpage
, u
'video URL', flags
=re
.DOTALL
)
501 title
= self
._html
_search
_regex
((r
"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
502 r
'<title>(?P<title>[^<]+?)</title>'), webpage
, 'title', flags
=re
.DOTALL
)
504 video_description
= self
._html
_search
_regex
(r
'<meta property="og:description" content="(?P<desc>.*?)"',
505 webpage
, u
'description', fatal
=False, flags
=re
.DOTALL
)
512 'description': video_description
,
516 class SteamIE(InfoExtractor
):
517 _VALID_URL
= r
"""http://store\.steampowered\.com/
519 (?P<urltype>video|app)/ #If the page is only for videos or for a game
521 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
523 _VIDEO_PAGE_TEMPLATE
= 'http://store.steampowered.com/video/%s/'
524 _AGECHECK_TEMPLATE
= 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
def suitable(cls, url):
    """Tell whether ``url`` matches this extractor's URL pattern.

    ``_VALID_URL`` is compiled in verbose mode, so whitespace and
    ``#`` comments inside the pattern are ignored.
    """
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
531 def _real_extract(self
, url
):
532 m
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
533 gameID
= m
.group('gameID')
535 videourl
= self
._VIDEO
_PAGE
_TEMPLATE
% gameID
536 webpage
= self
._download
_webpage
(videourl
, gameID
)
538 if re
.search('<h2>Please enter your birth date to continue:</h2>', webpage
) is not None:
539 videourl
= self
._AGECHECK
_TEMPLATE
% gameID
540 self
.report_age_confirmation()
541 webpage
= self
._download
_webpage
(videourl
, gameID
)
543 self
.report_extraction(gameID
)
544 game_title
= self
._html
_search
_regex
(r
'<h2 class="pageheader">(.*?)</h2>',
545 webpage
, 'game title')
547 urlRE
= r
"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
548 mweb
= re
.finditer(urlRE
, webpage
)
549 namesRE
= r
'<span class="title">(?P<videoName>.+?)</span>'
550 titles
= re
.finditer(namesRE
, webpage
)
551 thumbsRE
= r
'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
552 thumbs
= re
.finditer(thumbsRE
, webpage
)
554 for vid
,vtitle
,thumb
in zip(mweb
,titles
,thumbs
):
555 video_id
= vid
.group('videoID')
556 title
= vtitle
.group('videoName')
557 video_url
= vid
.group('videoURL')
558 video_thumb
= thumb
.group('thumbnail')
560 raise ExtractorError(u
'Cannot find video url for %s' % video_id
)
565 'title': unescapeHTML(title
),
566 'thumbnail': video_thumb
569 return [self
.playlist_result(videos
, gameID
, game_title
)]
571 class UstreamIE(InfoExtractor
):
572 _VALID_URL
= r
'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
575 def _real_extract(self
, url
):
576 m
= re
.match(self
._VALID
_URL
, url
)
577 video_id
= m
.group('videoID')
579 video_url
= u
'http://tcdn.ustream.tv/video/%s' % video_id
580 webpage
= self
._download
_webpage
(url
, video_id
)
582 self
.report_extraction(video_id
)
584 video_title
= self
._html
_search
_regex
(r
'data-title="(?P<title>.+)"',
587 uploader
= self
._html
_search
_regex
(r
'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
588 webpage
, u
'uploader', fatal
=False, flags
=re
.DOTALL
)
590 thumbnail
= self
._html
_search
_regex
(r
'<link rel="image_src" href="(?P<thumb>.*?)"',
591 webpage
, u
'thumbnail', fatal
=False)
597 'title': video_title
,
598 'uploader': uploader
,
599 'thumbnail': thumbnail
,
603 class WorldStarHipHopIE(InfoExtractor
):
604 _VALID_URL
= r
'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
605 IE_NAME
= u
'WorldStarHipHop'
607 def _real_extract(self
, url
):
608 m
= re
.match(self
._VALID
_URL
, url
)
609 video_id
= m
.group('id')
611 webpage_src
= self
._download
_webpage
(url
, video_id
)
613 video_url
= self
._search
_regex
(r
'so\.addVariable\("file","(.*?)"\)',
614 webpage_src
, u
'video URL')
616 if 'mp4' in video_url
:
621 video_title
= self
._html
_search
_regex
(r
"<title>(.*)</title>",
622 webpage_src
, u
'title')
624 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
625 thumbnail
= self
._html
_search
_regex
(r
'rel="image_src" href="(.*)" />',
626 webpage_src
, u
'thumbnail', fatal
=False)
629 _title
= r
"""candytitles.*>(.*)</span>"""
630 mobj
= re
.search(_title
, webpage_src
)
632 video_title
= mobj
.group(1)
637 'title' : video_title
,
638 'thumbnail' : thumbnail
,
643 class RBMARadioIE(InfoExtractor
):
644 _VALID_URL
= r
'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
646 def _real_extract(self
, url
):
647 m
= re
.match(self
._VALID
_URL
, url
)
648 video_id
= m
.group('videoID')
650 webpage
= self
._download
_webpage
(url
, video_id
)
652 json_data
= self
._search
_regex
(r
'window\.gon.*?gon\.show=(.+?);$',
653 webpage
, u
'json data', flags
=re
.MULTILINE
)
656 data
= json
.loads(json_data
)
657 except ValueError as e
:
658 raise ExtractorError(u
'Invalid JSON: ' + str(e
))
660 video_url
= data
['akamai_url'] + '&cbr=256'
661 url_parts
= compat_urllib_parse_urlparse(video_url
)
662 video_ext
= url_parts
.path
.rpartition('.')[2]
667 'title': data
['title'],
668 'description': data
.get('teaser_text'),
669 'location': data
.get('country_of_origin'),
670 'uploader': data
.get('host', {}).get('name'),
671 'uploader_id': data
.get('host', {}).get('slug'),
672 'thumbnail': data
.get('image', {}).get('large_url_2x'),
673 'duration': data
.get('duration'),
678 class YouPornIE(InfoExtractor
):
679 """Information extractor for youporn.com."""
680 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
def _print_formats(self, formats):
    """Write a two-column (extension, format) table of ``formats`` to stdout.

    Each entry of ``formats`` is indexed with the keys ``'ext'`` and
    ``'format'``.
    """
    header_lines = (
        u'Available formats:',
        u'ext\t\tformat',
        u'---------------------------------',
    )
    for line in header_lines:
        print(line)
    for entry in formats:
        row = (entry['ext'], entry['format'])
        print(u'%s\t\t%s' % row)
690 def _specific(self
, req_format
, formats
):
692 if(x
["format"]==req_format
):
696 def _real_extract(self
, url
):
697 mobj
= re
.match(self
._VALID
_URL
, url
)
699 raise ExtractorError(u
'Invalid URL: %s' % url
)
700 video_id
= mobj
.group('videoid')
702 req
= compat_urllib_request
.Request(url
)
703 req
.add_header('Cookie', 'age_verified=1')
704 webpage
= self
._download
_webpage
(req
, video_id
)
706 # Get JSON parameters
707 json_params
= self
._search
_regex
(r
'var currentVideo = new Video\((.*)\);', webpage
, u
'JSON parameters')
709 params
= json
.loads(json_params
)
711 raise ExtractorError(u
'Invalid JSON')
713 self
.report_extraction(video_id
)
715 video_title
= params
['title']
716 upload_date
= unified_strdate(params
['release_date_f'])
717 video_description
= params
['description']
718 video_uploader
= params
['submitted_by']
719 thumbnail
= params
['thumbnails'][0]['image']
721 raise ExtractorError('Missing JSON parameter: ' + sys
.exc_info()[1])
723 # Get all of the formats available
724 DOWNLOAD_LIST_RE
= r
'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
725 download_list_html
= self
._search
_regex
(DOWNLOAD_LIST_RE
,
726 webpage
, u
'download list').strip()
728 # Get all of the links from the page
729 LINK_RE
= r
'(?s)<a href="(?P<url>[^"]+)">'
730 links
= re
.findall(LINK_RE
, download_list_html
)
732 raise ExtractorError(u
'ERROR: no known formats available for video')
734 self
.to_screen(u
'Links found: %d' % len(links
))
739 # A link looks like this:
740 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
741 # A path looks like this:
742 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
743 video_url
= unescapeHTML( link
)
744 path
= compat_urllib_parse_urlparse( video_url
).path
745 extension
= os
.path
.splitext( path
)[1][1:]
746 format
= path
.split('/')[4].split('_')[:2]
749 format
= "-".join( format
)
750 # title = u'%s-%s-%s' % (video_title, size, bitrate)
755 'uploader': video_uploader
,
756 'upload_date': upload_date
,
757 'title': video_title
,
760 'thumbnail': thumbnail
,
761 'description': video_description
764 if self
._downloader
.params
.get('listformats', None):
765 self
._print
_formats
(formats
)
768 req_format
= self
._downloader
.params
.get('format', None)
769 self
.to_screen(u
'Format: %s' % req_format
)
771 if req_format
is None or req_format
== 'best':
773 elif req_format
== 'worst':
775 elif req_format
in ('-1', 'all'):
778 format
= self
._specific
( req_format
, formats
)
780 raise ExtractorError(u
'Requested format not available')
785 class PornotubeIE(InfoExtractor
):
786 """Information extractor for pornotube.com."""
787 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
789 def _real_extract(self
, url
):
790 mobj
= re
.match(self
._VALID
_URL
, url
)
792 raise ExtractorError(u
'Invalid URL: %s' % url
)
794 video_id
= mobj
.group('videoid')
795 video_title
= mobj
.group('title')
797 # Get webpage content
798 webpage
= self
._download
_webpage
(url
, video_id
)
801 VIDEO_URL_RE
= r
'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
802 video_url
= self
._search
_regex
(VIDEO_URL_RE
, webpage
, u
'video url')
803 video_url
= compat_urllib_parse
.unquote(video_url
)
805 #Get the uploaded date
806 VIDEO_UPLOADED_RE
= r
'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
807 upload_date
= self
._html
_search
_regex
(VIDEO_UPLOADED_RE
, webpage
, u
'upload date', fatal
=False)
808 if upload_date
: upload_date
= unified_strdate(upload_date
)
810 info
= {'id': video_id
,
813 'upload_date': upload_date
,
814 'title': video_title
,
820 class YouJizzIE(InfoExtractor
):
821 """Information extractor for youjizz.com."""
822 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
824 def _real_extract(self
, url
):
825 mobj
= re
.match(self
._VALID
_URL
, url
)
827 raise ExtractorError(u
'Invalid URL: %s' % url
)
829 video_id
= mobj
.group('videoid')
831 # Get webpage content
832 webpage
= self
._download
_webpage
(url
, video_id
)
834 # Get the video title
835 video_title
= self
._html
_search
_regex
(r
'<title>(?P<title>.*)</title>',
836 webpage
, u
'title').strip()
839 result
= re
.search(r
'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage
)
841 raise ExtractorError(u
'ERROR: unable to extract embed page')
843 embed_page_url
= result
.group(0).strip()
844 video_id
= result
.group('videoid')
846 webpage
= self
._download
_webpage
(embed_page_url
, video_id
)
849 video_url
= self
._search
_regex
(r
'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
850 webpage
, u
'video URL')
852 info
= {'id': video_id
,
854 'title': video_title
,
857 'player_url': embed_page_url
}
861 class EightTracksIE(InfoExtractor
):
863 _VALID_URL
= r
'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
865 def _real_extract(self
, url
):
866 mobj
= re
.match(self
._VALID
_URL
, url
)
868 raise ExtractorError(u
'Invalid URL: %s' % url
)
869 playlist_id
= mobj
.group('id')
871 webpage
= self
._download
_webpage
(url
, playlist_id
)
873 json_like
= self
._search
_regex
(r
"PAGE.mix = (.*?);\n", webpage
, u
'trax information', flags
=re
.DOTALL
)
874 data
= json
.loads(json_like
)
876 session
= str(random
.randint(0, 1000000000))
878 track_count
= data
['tracks_count']
879 first_url
= 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session
, mix_id
)
882 for i
in itertools
.count():
883 api_json
= self
._download
_webpage
(next_url
, playlist_id
,
884 note
=u
'Downloading song information %s/%s' % (str(i
+1), track_count
),
885 errnote
=u
'Failed to download song information')
886 api_data
= json
.loads(api_json
)
887 track_data
= api_data
[u
'set']['track']
889 'id': track_data
['id'],
890 'url': track_data
['track_file_stream_url'],
891 'title': track_data
['performer'] + u
' - ' + track_data
['name'],
892 'raw_title': track_data
['name'],
893 'uploader_id': data
['user']['login'],
897 if api_data
['set']['at_last_track']:
899 next_url
= 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session
, mix_id
, track_data
['id'])
902 class KeekIE(InfoExtractor
):
903 _VALID_URL
= r
'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
906 def _real_extract(self
, url
):
907 m
= re
.match(self
._VALID
_URL
, url
)
908 video_id
= m
.group('videoID')
910 video_url
= u
'http://cdn.keek.com/keek/video/%s' % video_id
911 thumbnail
= u
'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
912 webpage
= self
._download
_webpage
(url
, video_id
)
914 video_title
= self
._html
_search
_regex
(r
'<meta property="og:title" content="(?P<title>.*?)"',
917 uploader
= self
._html
_search
_regex
(r
'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
918 webpage
, u
'uploader', fatal
=False)
924 'title': video_title
,
925 'thumbnail': thumbnail
,
930 class TEDIE(InfoExtractor
):
931 _VALID_URL
=r
'''http://www\.ted\.com/
933 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
935 ((?P<type_talk>talks)) # We have a simple talk
937 (/lang/(.*?))? # The url may contain the language
938 /(?P<name>\w+) # Here goes the name and then ".html"
def suitable(cls, url):
    """Tell whether ``url`` matches this extractor's URL pattern.

    ``_VALID_URL`` is compiled in verbose mode, so whitespace and
    ``#`` comments inside the pattern are ignored.
    """
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
946 def _real_extract(self
, url
):
947 m
=re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
948 if m
.group('type_talk'):
949 return [self
._talk
_info
(url
)]
951 playlist_id
=m
.group('playlist_id')
953 self
.to_screen(u
'Getting info of playlist %s: "%s"' % (playlist_id
,name
))
954 return [self
._playlist
_videos
_info
(url
,name
,playlist_id
)]
956 def _playlist_videos_info(self
,url
,name
,playlist_id
=0):
957 '''Returns the videos of the playlist'''
959 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
960 ([.\s]*?)data-playlist_item_id="(\d+)"
961 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
963 video_name_RE
=r
'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
964 webpage
=self
._download
_webpage
(url
, playlist_id
, 'Downloading playlist webpage')
965 m_videos
=re
.finditer(video_RE
,webpage
,re
.VERBOSE
)
966 m_names
=re
.finditer(video_name_RE
,webpage
)
968 playlist_title
= self
._html
_search
_regex
(r
'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
969 webpage
, 'playlist title')
971 playlist_entries
= []
972 for m_video
, m_name
in zip(m_videos
,m_names
):
973 video_id
=m_video
.group('video_id')
974 talk_url
='http://www.ted.com%s' % m_name
.group('talk_url')
975 playlist_entries
.append(self
.url_result(talk_url
, 'TED'))
976 return self
.playlist_result(playlist_entries
, playlist_id
= playlist_id
, playlist_title
= playlist_title
)
978 def _talk_info(self
, url
, video_id
=0):
979 """Return the video for the talk in the url"""
980 m
= re
.match(self
._VALID
_URL
, url
,re
.VERBOSE
)
981 video_name
= m
.group('name')
982 webpage
= self
._download
_webpage
(url
, video_id
, 'Downloading \"%s\" page' % video_name
)
983 self
.report_extraction(video_name
)
984 # If the url includes the language we get the title translated
985 title
= self
._html
_search
_regex
(r
'<span id="altHeadline" >(?P<title>.*)</span>',
987 json_data
= self
._search
_regex
(r
'<script.*?>var talkDetails = ({.*?})</script>',
988 webpage
, 'json data')
989 info
= json
.loads(json_data
)
990 desc
= self
._html
_search
_regex
(r
'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
991 webpage
, 'description', flags
= re
.DOTALL
)
993 thumbnail
= self
._search
_regex
(r
'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
994 webpage
, 'thumbnail')
997 'url': info
['htmlStreams'][-1]['file'],
1000 'thumbnail': thumbnail
,
1001 'description': desc
,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de"""
    # The dots are escaped so that only the literal host name matches.
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Extract the video information for a myspass.de URL.

        The video id is read from the URL path and used to download an XML
        metadata document; the media URL, title, format, description and
        thumbnail are read from that document.

        Returns a list holding one info dictionary.
        Raises ExtractorError when the metadata has no 'url_flv' or no
        'title' element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # mandatory values
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # optional values; 'video_format' avoids shadowing the builtin format()
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        description = None if description_el is None else description_el.text

        image_preview_el = metadata.find('imagePreview')
        thumbnail = None if image_preview_el is None else image_preview_el.text

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos"""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The title is read from the video page; file name and duration are
        read from the last child element of a per-video XML document.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child element of the document root is used.
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com"""

    # 'https?' makes the trailing 's' optional; the previous 'http?' made the
    # 'p' optional instead, so https URLs never matched.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the video information for a liveleak.com view URL.

        Returns a list holding one info dictionary.
        Raises ExtractorError when url does not match _VALID_URL.
        Description and uploader are looked up with fatal=False.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title value carries a site prefix that is stripped here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }
        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video information for a tumblr post or video URL.

        Returns a list holding one info dictionary.
        Raises ExtractorError when no video file reference is found in the
        page. The thumbnail is looked up with fatal=False.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the post page, whichever URL form was given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # blog and video_id come from the URL; escape them so that characters
        # such as '.' are matched literally instead of as regex syntax.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (
            re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the download information for a free bandcamp track.

        Returns a list holding one info dictionary.
        Raises ExtractorError when the track has no free download page or
        when one of the values needed to build the download URL is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        # 'track_id' rather than 'id', which would shadow the builtin
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')

        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']

        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')

        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }
        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both values are required; the lookups use the default fatal mode.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        heading = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
        }
        return [result]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The media URL and title are read from the MRSS notice of the video
        rather than from the page the URL points to.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        notice_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        notice = self._download_webpage(notice_url, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')
        notice_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': notice_title,
        }
        return [result]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Description and thumbnail are looked up with fatal=False.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is re-requested under its canonical address.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        og_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        summary = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        og_image = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': og_title,
            'description': summary,
            'thumbnail': og_image,
        }
        return [result]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Thumbnail and uploader are looked up with fatal=False.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always requested over https under its short address.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        og_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        og_image = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        user_name = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        result = {
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': og_title,
            'thumbnail': og_image,
            'uploader': user_name,
        }
        return [result]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Two intermediate documents are downloaded: the first yields a node
        id, the second yields the stream location built from that node id.

        Raises ExtractorError when the second document contains no stream
        entry. Description and thumbnail are looked up with fatal=False.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        video_uploader_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part is HTML-unescaped before being appended.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        og_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        og_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        og_image = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': og_title,
            'description': og_description,
            'thumbnail': og_image,
            'uploader_id': video_uploader_id,
        }
        return [result]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The numeric video id is read from the page and used to download an
        XML data document that holds the media URL.

        Raises ExtractorError when url does not match _VALID_URL.
        Thumbnail and description are looked up with fatal=False.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        page = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        og_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        og_image = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        og_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': og_title,
            'thumbnail': og_image,
            'description': og_description,
        }
        return [result]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after 'www' is escaped; unescaped it matched any character.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the video information for an xhamster.com movie URL.

        Returns a list holding one info dictionary.
        Raises ExtractorError when the media location is not found in the
        page. A missing upload date only produces a warning; the uploader
        id defaults to u'anonymous' and the thumbnail lookup is not fatal.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: 'file' is used on its own, URL-unquoted.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI, so it is not extracted.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into one string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the track information for a hypem.com track URL.

        The track page is requested first; its Set-Cookie header is sent
        along with the second request, which returns the media URL as JSON.

        Returns a list holding one info dictionary.
        Raises ExtractorError when url does not match _VALID_URL or when
        either response does not contain valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The request body must be bytes: Python 3 rejects a str body, and on
        # Python 2 b"" is the same object type as "".
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Extract the video information for a vbox7.com play URL.

        The first page only carries a javascript redirect; the title is read
        from the redirect target, and the media and thumbnail URLs come from
        a separate form-encoded POST request.

        Returns a list holding one info dictionary.
        Raises ExtractorError when url does not match _VALID_URL or when the
        info response is missing or not made of two 'name=value' fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # POST data has to be bytes on Python 3; urlencode() output is ASCII.
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # Split each field on the first '=' only, so values that themselves
        # contain '=' (e.g. URLs with a query string) are kept whole.
        fields = [field.split('=', 1) for field in info_response.split('&')]
        if len(fields) != 2 or any(len(field) != 2 for field in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url = fields[0][1]
        thumbnail_url = fields[1][1]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1560 def gen_extractors():
1561 """ Return a list of an instance of every supported extractor.
1562 The order does matter; the first extractor matched is the one handling the URL.
1565 YoutubePlaylistIE(),
1590 StanfordOpenClassroomIE(),
1600 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    'IE' is appended to ie_name and the result is used as a key into this
    module's global namespace.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]