10 import xml
.etree
.ElementTree
19 from .extractor
.common
import InfoExtractor
, SearchInfoExtractor
21 from .extractor
.ard
import ARDIE
22 from .extractor
.arte
import ArteTvIE
23 from .extractor
.bliptv
import BlipTVIE
, BlipTVUserIE
24 from .extractor
.comedycentral
import ComedyCentralIE
25 from .extractor
.collegehumor
import CollegeHumorIE
26 from .extractor
.dailymotion
import DailymotionIE
27 from .extractor
.depositfiles
import DepositFilesIE
28 from .extractor
.escapist
import EscapistIE
29 from .extractor
.facebook
import FacebookIE
30 from .extractor
.gametrailers
import GametrailersIE
31 from .extractor
.generic
import GenericIE
32 from .extractor
.googleplus
import GooglePlusIE
33 from .extractor
.googlesearch
import GoogleSearchIE
34 from .extractor
.metacafe
import MetacafeIE
35 from .extractor
.myvideo
import MyVideoIE
36 from .extractor
.statigram
import StatigramIE
37 from .extractor
.photobucket
import PhotobucketIE
38 from .extractor
.soundcloud
import SoundcloudIE
, SoundcloudSetIE
39 from .extractor
.vimeo
import VimeoIE
40 from .extractor
.xvideos
import XVideosIE
41 from .extractor
.yahoo
import YahooIE
, YahooSearchIE
42 from .extractor
.youtube
import YoutubeIE
, YoutubePlaylistIE
, YoutubeSearchIE
, YoutubeUserIE
, YoutubeChannelIE
43 from .extractor
.zdf
import ZDFIE
47 class InfoQIE(InfoExtractor
):
48 """Information extractor for infoq.com"""
49 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
51 def _real_extract(self
, url
):
52 mobj
= re
.match(self
._VALID
_URL
, url
)
54 raise ExtractorError(u
'Invalid URL: %s' % url
)
56 webpage
= self
._download
_webpage
(url
, video_id
=url
)
57 self
.report_extraction(url
)
60 mobj
= re
.search(r
"jsclassref ?= ?'([^']*)'", webpage
)
62 raise ExtractorError(u
'Unable to extract video url')
63 real_id
= compat_urllib_parse
.unquote(base64
.b64decode(mobj
.group(1).encode('ascii')).decode('utf-8'))
64 video_url
= 'rtmpe://video.infoq.com/cfx/st/' + real_id
67 video_title
= self
._search
_regex
(r
'contentTitle = "(.*?)";',
71 video_description
= self
._html
_search
_regex
(r
'<meta name="description" content="(.*)"(?:\s*/)?>',
72 webpage
, u
'description', fatal
=False)
74 video_filename
= video_url
.split('/')[-1]
75 video_id
, extension
= video_filename
.split('.')
83 'ext': extension
, # Extension is always(?) mp4, but seems to be flv
85 'description': video_description
,
90 class MixcloudIE(InfoExtractor
):
91 """Information extractor for www.mixcloud.com"""
93 _WORKING
= False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
94 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
def report_download_json(self, file_id):
    """Announce on screen that the JSON download is starting.

    ``file_id`` is accepted but not used; the message shown is fixed.
    """
    notice = u'Downloading json'
    self.to_screen(notice)
101 def get_urls(self
, jsonData
, fmt
, bitrate
='best'):
102 """Get urls from 'audio_formats' section in json"""
105 bitrate_list
= jsonData
[fmt
]
106 if bitrate
is None or bitrate
== 'best' or bitrate
not in bitrate_list
:
107 bitrate
= max(bitrate_list
) # select highest
109 url_list
= jsonData
[fmt
][bitrate
]
110 except TypeError: # we have no bitrate info.
111 url_list
= jsonData
[fmt
]
114 def check_urls(self
, url_list
):
115 """Returns 1st active url from list"""
118 compat_urllib_request
.urlopen(url
)
120 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
125 def _print_formats(self
, formats
):
126 print('Available formats:')
127 for fmt
in formats
.keys():
128 for b
in formats
[fmt
]:
130 ext
= formats
[fmt
][b
][0]
131 print('%s\t%s\t[%s]' % (fmt
, b
, ext
.split('.')[-1]))
132 except TypeError: # we have no bitrate info
133 ext
= formats
[fmt
][0]
134 print('%s\t%s\t[%s]' % (fmt
, '??', ext
.split('.')[-1]))
137 def _real_extract(self
, url
):
138 mobj
= re
.match(self
._VALID
_URL
, url
)
140 raise ExtractorError(u
'Invalid URL: %s' % url
)
141 # extract uploader & filename from url
142 uploader
= mobj
.group(1).decode('utf-8')
143 file_id
= uploader
+ "-" + mobj
.group(2).decode('utf-8')
145 # construct API request
146 file_url
= 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url
.split('/')[-3:-1]) + '.json'
147 # retrieve .json file with links to files
148 request
= compat_urllib_request
.Request(file_url
)
150 self
.report_download_json(file_url
)
151 jsonData
= compat_urllib_request
.urlopen(request
).read()
152 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
153 raise ExtractorError(u
'Unable to retrieve file: %s' % compat_str(err
))
156 json_data
= json
.loads(jsonData
)
157 player_url
= json_data
['player_swf_url']
158 formats
= dict(json_data
['audio_formats'])
160 req_format
= self
._downloader
.params
.get('format', None)
163 if self
._downloader
.params
.get('listformats', None):
164 self
._print
_formats
(formats
)
167 if req_format
is None or req_format
== 'best':
168 for format_param
in formats
.keys():
169 url_list
= self
.get_urls(formats
, format_param
)
171 file_url
= self
.check_urls(url_list
)
172 if file_url
is not None:
175 if req_format
not in formats
:
176 raise ExtractorError(u
'Format is not available')
178 url_list
= self
.get_urls(formats
, req_format
)
179 file_url
= self
.check_urls(url_list
)
180 format_param
= req_format
183 'id': file_id
.decode('utf-8'),
184 'url': file_url
.decode('utf-8'),
185 'uploader': uploader
.decode('utf-8'),
187 'title': json_data
['name'],
188 'ext': file_url
.split('.')[-1].decode('utf-8'),
189 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
190 'thumbnail': json_data
['thumbnail_url'],
191 'description': json_data
['description'],
192 'player_url': player_url
.decode('utf-8'),
195 class StanfordOpenClassroomIE(InfoExtractor
):
196 """Information extractor for Stanford's Open ClassRoom"""
198 _VALID_URL
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
199 IE_NAME
= u
'stanfordoc'
201 def _real_extract(self
, url
):
202 mobj
= re
.match(self
._VALID
_URL
, url
)
204 raise ExtractorError(u
'Invalid URL: %s' % url
)
206 if mobj
.group('course') and mobj
.group('video'): # A specific video
207 course
= mobj
.group('course')
208 video
= mobj
.group('video')
210 'id': course
+ '_' + video
,
215 self
.report_extraction(info
['id'])
216 baseUrl
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course
+ '/videos/'
217 xmlUrl
= baseUrl
+ video
+ '.xml'
219 metaXml
= compat_urllib_request
.urlopen(xmlUrl
).read()
220 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
221 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
222 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
224 info
['title'] = mdoc
.findall('./title')[0].text
225 info
['url'] = baseUrl
+ mdoc
.findall('./videoFile')[0].text
227 raise ExtractorError(u
'Invalid metadata XML file')
228 info
['ext'] = info
['url'].rpartition('.')[2]
230 elif mobj
.group('course'): # A course page
231 course
= mobj
.group('course')
239 coursepage
= self
._download
_webpage
(url
, info
['id'],
240 note
='Downloading course info page',
241 errnote
='Unable to download course info page')
243 info
['title'] = self
._html
_search
_regex
('<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id'])
245 info
['description'] = self
._html
_search
_regex
('<description>([^<]+)</description>',
246 coursepage
, u
'description', fatal
=False)
248 links
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
))
252 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
),
256 for entry
in info
['list']:
257 assert entry
['type'] == 'reference'
258 results
+= self
.extract(entry
['url'])
262 'id': 'Stanford OpenClassroom',
268 self
.report_download_webpage(info
['id'])
269 rootURL
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
271 rootpage
= compat_urllib_request
.urlopen(rootURL
).read()
272 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
273 raise ExtractorError(u
'Unable to download course info page: ' + compat_str(err
))
275 info
['title'] = info
['id']
277 links
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
))
281 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
),
286 for entry
in info
['list']:
287 assert entry
['type'] == 'reference'
288 results
+= self
.extract(entry
['url'])
291 class MTVIE(InfoExtractor
):
292 """Information extractor for MTV.com"""
294 _VALID_URL
= r
'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
297 def _real_extract(self
, url
):
298 mobj
= re
.match(self
._VALID
_URL
, url
)
300 raise ExtractorError(u
'Invalid URL: %s' % url
)
301 if not mobj
.group('proto'):
302 url
= 'http://' + url
303 video_id
= mobj
.group('videoid')
305 webpage
= self
._download
_webpage
(url
, video_id
)
307 song_name
= self
._html
_search
_regex
(r
'<meta name="mtv_vt" content="([^"]+)"/>',
308 webpage
, u
'song name', fatal
=False)
310 video_title
= self
._html
_search
_regex
(r
'<meta name="mtv_an" content="([^"]+)"/>',
313 mtvn_uri
= self
._html
_search
_regex
(r
'<meta name="mtvn_uri" content="([^"]+)"/>',
314 webpage
, u
'mtvn_uri', fatal
=False)
316 content_id
= self
._search
_regex
(r
'MTVN.Player.defaultPlaylistId = ([0-9]+);',
317 webpage
, u
'content id', fatal
=False)
319 videogen_url
= 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri
+ '&id=' + content_id
+ '&vid=' + video_id
+ '&ref=www.mtvn.com&viewUri=' + mtvn_uri
320 self
.report_extraction(video_id
)
321 request
= compat_urllib_request
.Request(videogen_url
)
323 metadataXml
= compat_urllib_request
.urlopen(request
).read()
324 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
325 raise ExtractorError(u
'Unable to download video metadata: %s' % compat_str(err
))
327 mdoc
= xml
.etree
.ElementTree
.fromstring(metadataXml
)
328 renditions
= mdoc
.findall('.//rendition')
330 # For now, always pick the highest quality.
331 rendition
= renditions
[-1]
334 _
,_
,ext
= rendition
.attrib
['type'].partition('/')
335 format
= ext
+ '-' + rendition
.attrib
['width'] + 'x' + rendition
.attrib
['height'] + '_' + rendition
.attrib
['bitrate']
336 video_url
= rendition
.find('./src').text
338 raise ExtractorError('Invalid rendition field.')
343 'uploader': performer
,
345 'title': video_title
,
353 class YoukuIE(InfoExtractor
):
354 _VALID_URL
= r
'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
357 nowTime
= int(time
.time() * 1000)
358 random1
= random
.randint(1000,1998)
359 random2
= random
.randint(1000,9999)
361 return "%d%d%d" %(nowTime
,random1
,random2
)
363 def _get_file_ID_mix_string(self
, seed
):
365 source
= list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
367 for i
in range(len(source
)):
368 seed
= (seed
* 211 + 30031 ) % 65536
369 index
= math
.floor(seed
/ 65536 * len(source
) )
370 mixed
.append(source
[int(index
)])
371 source
.remove(source
[int(index
)])
372 #return ''.join(mixed)
375 def _get_file_id(self
, fileId
, seed
):
376 mixed
= self
._get
_file
_ID
_mix
_string
(seed
)
377 ids
= fileId
.split('*')
381 realId
.append(mixed
[int(ch
)])
382 return ''.join(realId
)
384 def _real_extract(self
, url
):
385 mobj
= re
.match(self
._VALID
_URL
, url
)
387 raise ExtractorError(u
'Invalid URL: %s' % url
)
388 video_id
= mobj
.group('ID')
390 info_url
= 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
392 jsondata
= self
._download
_webpage
(info_url
, video_id
)
394 self
.report_extraction(video_id
)
396 config
= json
.loads(jsondata
)
398 video_title
= config
['data'][0]['title']
399 seed
= config
['data'][0]['seed']
401 format
= self
._downloader
.params
.get('format', None)
402 supported_format
= list(config
['data'][0]['streamfileids'].keys())
404 if format
is None or format
== 'best':
405 if 'hd2' in supported_format
:
410 elif format
== 'worst':
418 fileid
= config
['data'][0]['streamfileids'][format
]
419 keys
= [s
['k'] for s
in config
['data'][0]['segs'][format
]]
420 except (UnicodeDecodeError, ValueError, KeyError):
421 raise ExtractorError(u
'Unable to extract info section')
424 sid
= self
._gen
_sid
()
425 fileid
= self
._get
_file
_id
(fileid
, seed
)
427 #column 8,9 of fileid represent the segment number
428 #fileid[7:9] should be changed
429 for index
, key
in enumerate(keys
):
431 temp_fileid
= '%s%02X%s' % (fileid
[0:8], index
, fileid
[10:])
432 download_url
= 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid
, index
, temp_fileid
, key
)
435 'id': '%s_part%02d' % (video_id
, index
),
439 'title': video_title
,
442 files_info
.append(info
)
447 class XNXXIE(InfoExtractor
):
448 """Information extractor for xnxx.com"""
450 _VALID_URL
= r
'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
452 VIDEO_URL_RE
= r
'flv_url=(.*?)&'
453 VIDEO_TITLE_RE
= r
'<title>(.*?)\s+-\s+XNXX.COM'
454 VIDEO_THUMB_RE
= r
'url_bigthumb=(.*?)&'
456 def _real_extract(self
, url
):
457 mobj
= re
.match(self
._VALID
_URL
, url
)
459 raise ExtractorError(u
'Invalid URL: %s' % url
)
460 video_id
= mobj
.group(1)
462 # Get webpage content
463 webpage
= self
._download
_webpage
(url
, video_id
)
465 video_url
= self
._search
_regex
(self
.VIDEO_URL_RE
,
466 webpage
, u
'video URL')
467 video_url
= compat_urllib_parse
.unquote(video_url
)
469 video_title
= self
._html
_search
_regex
(self
.VIDEO_TITLE_RE
,
472 video_thumbnail
= self
._search
_regex
(self
.VIDEO_THUMB_RE
,
473 webpage
, u
'thumbnail', fatal
=False)
480 'title': video_title
,
482 'thumbnail': video_thumbnail
,
488 class NBAIE(InfoExtractor
):
489 _VALID_URL
= r
'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
492 def _real_extract(self
, url
):
493 mobj
= re
.match(self
._VALID
_URL
, url
)
495 raise ExtractorError(u
'Invalid URL: %s' % url
)
497 video_id
= mobj
.group(1)
499 webpage
= self
._download
_webpage
(url
, video_id
)
501 video_url
= u
'http://ht-mobile.cdn.turner.com/nba/big' + video_id
+ '_nba_1280x720.mp4'
503 shortened_video_id
= video_id
.rpartition('/')[2]
504 title
= self
._html
_search
_regex
(r
'<meta property="og:title" content="(.*?)"',
505 webpage
, 'title', default
=shortened_video_id
).replace('NBA.com: ', '')
507 # It isn't there in the HTML it returns to us
508 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
510 description
= self
._html
_search
_regex
(r
'<meta name="description" (?:content|value)="(.*?)" />', webpage
, 'description', fatal
=False)
513 'id': shortened_video_id
,
517 # 'uploader_date': uploader_date,
518 'description': description
,
522 class JustinTVIE(InfoExtractor
):
523 """Information extractor for justin.tv and twitch.tv"""
524 # TODO: One broadcast may be split into multiple videos. The key
525 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
526 # starts at 1 and increases. Can we treat all parts as one video?
528 _VALID_URL
= r
"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
530 (?P<channelid>[^/]+)|
531 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
532 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
536 _JUSTIN_PAGE_LIMIT
= 100
537 IE_NAME
= u
'justin.tv'
def report_download_page(self, channel, offset):
    """Announce on screen which range of videos is about to be fetched.

    The reported range starts at ``offset`` and ends at ``offset`` plus
    ``self._JUSTIN_PAGE_LIMIT``.
    """
    range_end = offset + self._JUSTIN_PAGE_LIMIT
    template = u'%s: Downloading video information from %d to %d'
    self.to_screen(template % (channel, offset, range_end))
544 # Return count of items, list of *valid* items
545 def _parse_page(self
, url
, video_id
):
546 webpage
= self
._download
_webpage
(url
, video_id
,
547 u
'Downloading video info JSON',
548 u
'unable to download video info JSON')
550 response
= json
.loads(webpage
)
551 if type(response
) != list:
552 error_text
= response
.get('error', 'unknown error')
553 raise ExtractorError(u
'Justin.tv API: %s' % error_text
)
555 for clip
in response
:
556 video_url
= clip
['video_file_url']
558 video_extension
= os
.path
.splitext(video_url
)[1][1:]
559 video_date
= re
.sub('-', '', clip
['start_time'][:10])
560 video_uploader_id
= clip
.get('user_id', clip
.get('channel_id'))
561 video_id
= clip
['id']
562 video_title
= clip
.get('title', video_id
)
566 'title': video_title
,
567 'uploader': clip
.get('channel_name', video_uploader_id
),
568 'uploader_id': video_uploader_id
,
569 'upload_date': video_date
,
570 'ext': video_extension
,
572 return (len(response
), info
)
574 def _real_extract(self
, url
):
575 mobj
= re
.match(self
._VALID
_URL
, url
)
577 raise ExtractorError(u
'invalid URL: %s' % url
)
579 api_base
= 'http://api.justin.tv'
581 if mobj
.group('channelid'):
583 video_id
= mobj
.group('channelid')
584 api
= api_base
+ '/channel/archives/%s.json' % video_id
585 elif mobj
.group('chapterid'):
586 chapter_id
= mobj
.group('chapterid')
588 webpage
= self
._download
_webpage
(url
, chapter_id
)
589 m
= re
.search(r
'PP\.archive_id = "([0-9]+)";', webpage
)
591 raise ExtractorError(u
'Cannot find archive of a chapter')
592 archive_id
= m
.group(1)
594 api
= api_base
+ '/broadcast/by_chapter/%s.xml' % chapter_id
595 chapter_info_xml
= self
._download
_webpage
(api
, chapter_id
,
596 note
=u
'Downloading chapter information',
597 errnote
=u
'Chapter information download failed')
598 doc
= xml
.etree
.ElementTree
.fromstring(chapter_info_xml
)
599 for a
in doc
.findall('.//archive'):
600 if archive_id
== a
.find('./id').text
:
603 raise ExtractorError(u
'Could not find chapter in chapter information')
605 video_url
= a
.find('./video_file_url').text
606 video_ext
= video_url
.rpartition('.')[2] or u
'flv'
608 chapter_api_url
= u
'https://api.twitch.tv/kraken/videos/c' + chapter_id
609 chapter_info_json
= self
._download
_webpage
(chapter_api_url
, u
'c' + chapter_id
,
610 note
='Downloading chapter metadata',
611 errnote
='Download of chapter metadata failed')
612 chapter_info
= json
.loads(chapter_info_json
)
614 bracket_start
= int(doc
.find('.//bracket_start').text
)
615 bracket_end
= int(doc
.find('.//bracket_end').text
)
617 # TODO determine start (and probably fix up file)
618 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
619 #video_url += u'?start=' + TODO:start_timestamp
620 # bracket_start is 13290, but we want 51670615
621 self
._downloader
.report_warning(u
'Chapter detected, but we can just download the whole file. '
622 u
'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start
), formatSeconds(bracket_end
)))
625 'id': u
'c' + chapter_id
,
628 'title': chapter_info
['title'],
629 'thumbnail': chapter_info
['preview'],
630 'description': chapter_info
['description'],
631 'uploader': chapter_info
['channel']['display_name'],
632 'uploader_id': chapter_info
['channel']['name'],
636 video_id
= mobj
.group('videoid')
637 api
= api_base
+ '/broadcast/by_archive/%s.json' % video_id
639 self
.report_extraction(video_id
)
643 limit
= self
._JUSTIN
_PAGE
_LIMIT
646 self
.report_download_page(video_id
, offset
)
647 page_url
= api
+ ('?offset=%d&limit=%d' % (offset
, limit
))
648 page_count
, page_info
= self
._parse
_page
(page_url
, video_id
)
649 info
.extend(page_info
)
650 if not paged
or page_count
!= limit
:
655 class FunnyOrDieIE(InfoExtractor
):
656 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
658 def _real_extract(self
, url
):
659 mobj
= re
.match(self
._VALID
_URL
, url
)
661 raise ExtractorError(u
'invalid URL: %s' % url
)
663 video_id
= mobj
.group('id')
664 webpage
= self
._download
_webpage
(url
, video_id
)
666 video_url
= self
._html
_search
_regex
(r
'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
667 webpage
, u
'video URL', flags
=re
.DOTALL
)
669 title
= self
._html
_search
_regex
((r
"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
670 r
'<title>(?P<title>[^<]+?)</title>'), webpage
, 'title', flags
=re
.DOTALL
)
672 video_description
= self
._html
_search
_regex
(r
'<meta property="og:description" content="(?P<desc>.*?)"',
673 webpage
, u
'description', fatal
=False, flags
=re
.DOTALL
)
680 'description': video_description
,
684 class SteamIE(InfoExtractor
):
685 _VALID_URL
= r
"""http://store\.steampowered\.com/
687 (?P<urltype>video|app)/ #If the page is only for videos or for a game
689 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
691 _VIDEO_PAGE_TEMPLATE
= 'http://store.steampowered.com/video/%s/'
692 _AGECHECK_TEMPLATE
= 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
def suitable(cls, url):
    """Return True when ``url`` matches ``cls._VALID_URL``, else False.

    The pattern is compiled with ``re.VERBOSE``, so whitespace and
    ``#`` comments inside ``_VALID_URL`` are ignored by the matcher.
    NOTE(review): the first parameter is ``cls``; presumably a
    ``@classmethod`` decorator sits just above this span -- confirm.
    """
    found = re.match(cls._VALID_URL, url, re.VERBOSE)
    return found is not None
699 def _real_extract(self
, url
):
700 m
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
701 gameID
= m
.group('gameID')
703 videourl
= self
._VIDEO
_PAGE
_TEMPLATE
% gameID
704 webpage
= self
._download
_webpage
(videourl
, gameID
)
706 if re
.search('<h2>Please enter your birth date to continue:</h2>', webpage
) is not None:
707 videourl
= self
._AGECHECK
_TEMPLATE
% gameID
708 self
.report_age_confirmation()
709 webpage
= self
._download
_webpage
(videourl
, gameID
)
711 self
.report_extraction(gameID
)
712 game_title
= self
._html
_search
_regex
(r
'<h2 class="pageheader">(.*?)</h2>',
713 webpage
, 'game title')
715 urlRE
= r
"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
716 mweb
= re
.finditer(urlRE
, webpage
)
717 namesRE
= r
'<span class="title">(?P<videoName>.+?)</span>'
718 titles
= re
.finditer(namesRE
, webpage
)
719 thumbsRE
= r
'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
720 thumbs
= re
.finditer(thumbsRE
, webpage
)
722 for vid
,vtitle
,thumb
in zip(mweb
,titles
,thumbs
):
723 video_id
= vid
.group('videoID')
724 title
= vtitle
.group('videoName')
725 video_url
= vid
.group('videoURL')
726 video_thumb
= thumb
.group('thumbnail')
728 raise ExtractorError(u
'Cannot find video url for %s' % video_id
)
733 'title': unescapeHTML(title
),
734 'thumbnail': video_thumb
737 return [self
.playlist_result(videos
, gameID
, game_title
)]
739 class UstreamIE(InfoExtractor
):
740 _VALID_URL
= r
'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
743 def _real_extract(self
, url
):
744 m
= re
.match(self
._VALID
_URL
, url
)
745 video_id
= m
.group('videoID')
747 video_url
= u
'http://tcdn.ustream.tv/video/%s' % video_id
748 webpage
= self
._download
_webpage
(url
, video_id
)
750 self
.report_extraction(video_id
)
752 video_title
= self
._html
_search
_regex
(r
'data-title="(?P<title>.+)"',
755 uploader
= self
._html
_search
_regex
(r
'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
756 webpage
, u
'uploader', fatal
=False, flags
=re
.DOTALL
)
758 thumbnail
= self
._html
_search
_regex
(r
'<link rel="image_src" href="(?P<thumb>.*?)"',
759 webpage
, u
'thumbnail', fatal
=False)
765 'title': video_title
,
766 'uploader': uploader
,
767 'thumbnail': thumbnail
,
771 class WorldStarHipHopIE(InfoExtractor
):
772 _VALID_URL
= r
'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
773 IE_NAME
= u
'WorldStarHipHop'
775 def _real_extract(self
, url
):
776 m
= re
.match(self
._VALID
_URL
, url
)
777 video_id
= m
.group('id')
779 webpage_src
= self
._download
_webpage
(url
, video_id
)
781 video_url
= self
._search
_regex
(r
'so\.addVariable\("file","(.*?)"\)',
782 webpage_src
, u
'video URL')
784 if 'mp4' in video_url
:
789 video_title
= self
._html
_search
_regex
(r
"<title>(.*)</title>",
790 webpage_src
, u
'title')
792 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
793 thumbnail
= self
._html
_search
_regex
(r
'rel="image_src" href="(.*)" />',
794 webpage_src
, u
'thumbnail', fatal
=False)
797 _title
= r
"""candytitles.*>(.*)</span>"""
798 mobj
= re
.search(_title
, webpage_src
)
800 video_title
= mobj
.group(1)
805 'title' : video_title
,
806 'thumbnail' : thumbnail
,
811 class RBMARadioIE(InfoExtractor
):
812 _VALID_URL
= r
'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
814 def _real_extract(self
, url
):
815 m
= re
.match(self
._VALID
_URL
, url
)
816 video_id
= m
.group('videoID')
818 webpage
= self
._download
_webpage
(url
, video_id
)
820 json_data
= self
._search
_regex
(r
'window\.gon.*?gon\.show=(.+?);$',
821 webpage
, u
'json data', flags
=re
.MULTILINE
)
824 data
= json
.loads(json_data
)
825 except ValueError as e
:
826 raise ExtractorError(u
'Invalid JSON: ' + str(e
))
828 video_url
= data
['akamai_url'] + '&cbr=256'
829 url_parts
= compat_urllib_parse_urlparse(video_url
)
830 video_ext
= url_parts
.path
.rpartition('.')[2]
835 'title': data
['title'],
836 'description': data
.get('teaser_text'),
837 'location': data
.get('country_of_origin'),
838 'uploader': data
.get('host', {}).get('name'),
839 'uploader_id': data
.get('host', {}).get('slug'),
840 'thumbnail': data
.get('image', {}).get('large_url_2x'),
841 'duration': data
.get('duration'),
846 class YouPornIE(InfoExtractor
):
847 """Information extractor for youporn.com."""
848 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
850 def _print_formats(self
, formats
):
851 """Print all available formats"""
852 print(u
'Available formats:')
853 print(u
'ext\t\tformat')
854 print(u
'---------------------------------')
855 for format
in formats
:
856 print(u
'%s\t\t%s' % (format
['ext'], format
['format']))
858 def _specific(self
, req_format
, formats
):
860 if(x
["format"]==req_format
):
864 def _real_extract(self
, url
):
865 mobj
= re
.match(self
._VALID
_URL
, url
)
867 raise ExtractorError(u
'Invalid URL: %s' % url
)
868 video_id
= mobj
.group('videoid')
870 req
= compat_urllib_request
.Request(url
)
871 req
.add_header('Cookie', 'age_verified=1')
872 webpage
= self
._download
_webpage
(req
, video_id
)
874 # Get JSON parameters
875 json_params
= self
._search
_regex
(r
'var currentVideo = new Video\((.*)\);', webpage
, u
'JSON parameters')
877 params
= json
.loads(json_params
)
879 raise ExtractorError(u
'Invalid JSON')
881 self
.report_extraction(video_id
)
883 video_title
= params
['title']
884 upload_date
= unified_strdate(params
['release_date_f'])
885 video_description
= params
['description']
886 video_uploader
= params
['submitted_by']
887 thumbnail
= params
['thumbnails'][0]['image']
889 raise ExtractorError('Missing JSON parameter: ' + sys
.exc_info()[1])
891 # Get all of the formats available
892 DOWNLOAD_LIST_RE
= r
'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
893 download_list_html
= self
._search
_regex
(DOWNLOAD_LIST_RE
,
894 webpage
, u
'download list').strip()
896 # Get all of the links from the page
897 LINK_RE
= r
'(?s)<a href="(?P<url>[^"]+)">'
898 links
= re
.findall(LINK_RE
, download_list_html
)
900 raise ExtractorError(u
'ERROR: no known formats available for video')
902 self
.to_screen(u
'Links found: %d' % len(links
))
907 # A link looks like this:
908 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
909 # A path looks like this:
910 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
911 video_url
= unescapeHTML( link
)
912 path
= compat_urllib_parse_urlparse( video_url
).path
913 extension
= os
.path
.splitext( path
)[1][1:]
914 format
= path
.split('/')[4].split('_')[:2]
917 format
= "-".join( format
)
918 # title = u'%s-%s-%s' % (video_title, size, bitrate)
923 'uploader': video_uploader
,
924 'upload_date': upload_date
,
925 'title': video_title
,
928 'thumbnail': thumbnail
,
929 'description': video_description
932 if self
._downloader
.params
.get('listformats', None):
933 self
._print
_formats
(formats
)
936 req_format
= self
._downloader
.params
.get('format', None)
937 self
.to_screen(u
'Format: %s' % req_format
)
939 if req_format
is None or req_format
== 'best':
941 elif req_format
== 'worst':
943 elif req_format
in ('-1', 'all'):
946 format
= self
._specific
( req_format
, formats
)
948 raise ExtractorError(u
'Requested format not available')
953 class PornotubeIE(InfoExtractor
):
954 """Information extractor for pornotube.com."""
955 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
957 def _real_extract(self
, url
):
958 mobj
= re
.match(self
._VALID
_URL
, url
)
960 raise ExtractorError(u
'Invalid URL: %s' % url
)
962 video_id
= mobj
.group('videoid')
963 video_title
= mobj
.group('title')
965 # Get webpage content
966 webpage
= self
._download
_webpage
(url
, video_id
)
969 VIDEO_URL_RE
= r
'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
970 video_url
= self
._search
_regex
(VIDEO_URL_RE
, webpage
, u
'video url')
971 video_url
= compat_urllib_parse
.unquote(video_url
)
973 #Get the uploaded date
974 VIDEO_UPLOADED_RE
= r
'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
975 upload_date
= self
._html
_search
_regex
(VIDEO_UPLOADED_RE
, webpage
, u
'upload date', fatal
=False)
976 if upload_date
: upload_date
= unified_strdate(upload_date
)
978 info
= {'id': video_id
,
981 'upload_date': upload_date
,
982 'title': video_title
,
988 class YouJizzIE(InfoExtractor
):
989 """Information extractor for youjizz.com."""
990 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
992 def _real_extract(self
, url
):
993 mobj
= re
.match(self
._VALID
_URL
, url
)
995 raise ExtractorError(u
'Invalid URL: %s' % url
)
997 video_id
= mobj
.group('videoid')
999 # Get webpage content
1000 webpage
= self
._download
_webpage
(url
, video_id
)
1002 # Get the video title
1003 video_title
= self
._html
_search
_regex
(r
'<title>(?P<title>.*)</title>',
1004 webpage
, u
'title').strip()
1006 # Get the embed page
1007 result
= re
.search(r
'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage
)
1009 raise ExtractorError(u
'ERROR: unable to extract embed page')
1011 embed_page_url
= result
.group(0).strip()
1012 video_id
= result
.group('videoid')
1014 webpage
= self
._download
_webpage
(embed_page_url
, video_id
)
1017 video_url
= self
._search
_regex
(r
'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1018 webpage
, u
'video URL')
1020 info
= {'id': video_id
,
1022 'title': video_title
,
1025 'player_url': embed_page_url
}
1029 class EightTracksIE(InfoExtractor
):
1031 _VALID_URL
= r
'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1033 def _real_extract(self
, url
):
1034 mobj
= re
.match(self
._VALID
_URL
, url
)
1036 raise ExtractorError(u
'Invalid URL: %s' % url
)
1037 playlist_id
= mobj
.group('id')
1039 webpage
= self
._download
_webpage
(url
, playlist_id
)
1041 json_like
= self
._search
_regex
(r
"PAGE.mix = (.*?);\n", webpage
, u
'trax information', flags
=re
.DOTALL
)
1042 data
= json
.loads(json_like
)
1044 session
= str(random
.randint(0, 1000000000))
1046 track_count
= data
['tracks_count']
1047 first_url
= 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session
, mix_id
)
1048 next_url
= first_url
1050 for i
in itertools
.count():
1051 api_json
= self
._download
_webpage
(next_url
, playlist_id
,
1052 note
=u
'Downloading song information %s/%s' % (str(i
+1), track_count
),
1053 errnote
=u
'Failed to download song information')
1054 api_data
= json
.loads(api_json
)
1055 track_data
= api_data
[u
'set']['track']
1057 'id': track_data
['id'],
1058 'url': track_data
['track_file_stream_url'],
1059 'title': track_data
['performer'] + u
' - ' + track_data
['name'],
1060 'raw_title': track_data
['name'],
1061 'uploader_id': data
['user']['login'],
1065 if api_data
['set']['at_last_track']:
1067 next_url
= 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session
, mix_id
, track_data
['id'])
1070 class KeekIE(InfoExtractor
):
1071 _VALID_URL
= r
'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1074 def _real_extract(self
, url
):
1075 m
= re
.match(self
._VALID
_URL
, url
)
1076 video_id
= m
.group('videoID')
1078 video_url
= u
'http://cdn.keek.com/keek/video/%s' % video_id
1079 thumbnail
= u
'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1080 webpage
= self
._download
_webpage
(url
, video_id
)
1082 video_title
= self
._html
_search
_regex
(r
'<meta property="og:title" content="(?P<title>.*?)"',
1085 uploader
= self
._html
_search
_regex
(r
'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1086 webpage
, u
'uploader', fatal
=False)
1092 'title': video_title
,
1093 'thumbnail': thumbnail
,
1094 'uploader': uploader
1098 class TEDIE(InfoExtractor
):
1099 _VALID_URL
=r
'''http://www\.ted\.com/
1101 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1103 ((?P<type_talk>talks)) # We have a simple talk
1105 (/lang/(.*?))? # The url may contain the language
1106 /(?P<name>\w+) # Here goes the name and then ".html"
1110 def suitable(cls
, url
):
1111 """Receives a URL and returns True if suitable for this IE."""
1112 return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None
1114 def _real_extract(self
, url
):
1115 m
=re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
1116 if m
.group('type_talk'):
1117 return [self
._talk
_info
(url
)]
1119 playlist_id
=m
.group('playlist_id')
1120 name
=m
.group('name')
1121 self
.to_screen(u
'Getting info of playlist %s: "%s"' % (playlist_id
,name
))
1122 return [self
._playlist
_videos
_info
(url
,name
,playlist_id
)]
1124 def _playlist_videos_info(self
,url
,name
,playlist_id
=0):
1125 '''Returns the videos of the playlist'''
1127 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1128 ([.\s]*?)data-playlist_item_id="(\d+)"
1129 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1131 video_name_RE
=r
'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1132 webpage
=self
._download
_webpage
(url
, playlist_id
, 'Downloading playlist webpage')
1133 m_videos
=re
.finditer(video_RE
,webpage
,re
.VERBOSE
)
1134 m_names
=re
.finditer(video_name_RE
,webpage
)
1136 playlist_title
= self
._html
_search
_regex
(r
'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1137 webpage
, 'playlist title')
1139 playlist_entries
= []
1140 for m_video
, m_name
in zip(m_videos
,m_names
):
1141 video_id
=m_video
.group('video_id')
1142 talk_url
='http://www.ted.com%s' % m_name
.group('talk_url')
1143 playlist_entries
.append(self
.url_result(talk_url
, 'TED'))
1144 return self
.playlist_result(playlist_entries
, playlist_id
= playlist_id
, playlist_title
= playlist_title
)
1146 def _talk_info(self
, url
, video_id
=0):
1147 """Return the video for the talk in the url"""
1148 m
= re
.match(self
._VALID
_URL
, url
,re
.VERBOSE
)
1149 video_name
= m
.group('name')
1150 webpage
= self
._download
_webpage
(url
, video_id
, 'Downloading \"%s\" page' % video_name
)
1151 self
.report_extraction(video_name
)
1152 # If the url includes the language we get the title translated
1153 title
= self
._html
_search
_regex
(r
'<span id="altHeadline" >(?P<title>.*)</span>',
1155 json_data
= self
._search
_regex
(r
'<script.*?>var talkDetails = ({.*?})</script>',
1156 webpage
, 'json data')
1157 info
= json
.loads(json_data
)
1158 desc
= self
._html
_search
_regex
(r
'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1159 webpage
, 'description', flags
= re
.DOTALL
)
1161 thumbnail
= self
._search
_regex
(r
'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1162 webpage
, 'thumbnail')
1165 'url': info
['htmlStreams'][-1]['file'],
1168 'thumbnail': thumbnail
,
1169 'description': desc
,
1173 class MySpassIE(InfoExtractor
):
1174 _VALID_URL
= r
'http://www.myspass.de/.*'
1176 def _real_extract(self
, url
):
1177 META_DATA_URL_TEMPLATE
= 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1179 # video id is the last path element of the URL
1180 # usually there is a trailing slash, so also try the second but last
1181 url_path
= compat_urllib_parse_urlparse(url
).path
1182 url_parent_path
, video_id
= os
.path
.split(url_path
)
1184 _
, video_id
= os
.path
.split(url_parent_path
)
1187 metadata_url
= META_DATA_URL_TEMPLATE
% video_id
1188 metadata_text
= self
._download
_webpage
(metadata_url
, video_id
)
1189 metadata
= xml
.etree
.ElementTree
.fromstring(metadata_text
.encode('utf-8'))
1191 # extract values from metadata
1192 url_flv_el
= metadata
.find('url_flv')
1193 if url_flv_el
is None:
1194 raise ExtractorError(u
'Unable to extract download url')
1195 video_url
= url_flv_el
.text
1196 extension
= os
.path
.splitext(video_url
)[1][1:]
1197 title_el
= metadata
.find('title')
1198 if title_el
is None:
1199 raise ExtractorError(u
'Unable to extract title')
1200 title
= title_el
.text
1201 format_id_el
= metadata
.find('format_id')
1202 if format_id_el
is None:
1205 format
= format_id_el
.text
1206 description_el
= metadata
.find('description')
1207 if description_el
is not None:
1208 description
= description_el
.text
1211 imagePreview_el
= metadata
.find('imagePreview')
1212 if imagePreview_el
is not None:
1213 thumbnail
= imagePreview_el
.text
1222 'thumbnail': thumbnail
,
1223 'description': description
1227 class SpiegelIE(InfoExtractor
):
1228 _VALID_URL
= r
'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1230 def _real_extract(self
, url
):
1231 m
= re
.match(self
._VALID
_URL
, url
)
1232 video_id
= m
.group('videoID')
1234 webpage
= self
._download
_webpage
(url
, video_id
)
1236 video_title
= self
._html
_search
_regex
(r
'<div class="module-title">(.*?)</div>',
1239 xml_url
= u
'http://video2.spiegel.de/flash/' + video_id
+ u
'.xml'
1240 xml_code
= self
._download
_webpage
(xml_url
, video_id
,
1241 note
=u
'Downloading XML', errnote
=u
'Failed to download XML')
1243 idoc
= xml
.etree
.ElementTree
.fromstring(xml_code
)
1244 last_type
= idoc
[-1]
1245 filename
= last_type
.findall('./filename')[0].text
1246 duration
= float(last_type
.findall('./duration')[0].text
)
1248 video_url
= 'http://video2.spiegel.de/flash/' + filename
1249 video_ext
= filename
.rpartition('.')[2]
1254 'title': video_title
,
1255 'duration': duration
,
1259 class LiveLeakIE(InfoExtractor
):
1261 _VALID_URL
= r
'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1262 IE_NAME
= u
'liveleak'
1264 def _real_extract(self
, url
):
1265 mobj
= re
.match(self
._VALID
_URL
, url
)
1267 raise ExtractorError(u
'Invalid URL: %s' % url
)
1269 video_id
= mobj
.group('video_id')
1271 webpage
= self
._download
_webpage
(url
, video_id
)
1273 video_url
= self
._search
_regex
(r
'file: "(.*?)",',
1274 webpage
, u
'video URL')
1276 video_title
= self
._html
_search
_regex
(r
'<meta property="og:title" content="(?P<title>.*?)"',
1277 webpage
, u
'title').replace('LiveLeak.com -', '').strip()
1279 video_description
= self
._html
_search
_regex
(r
'<meta property="og:description" content="(?P<desc>.*?)"',
1280 webpage
, u
'description', fatal
=False)
1282 video_uploader
= self
._html
_search
_regex
(r
'By:.*?(\w+)</a>',
1283 webpage
, u
'uploader', fatal
=False)
1289 'title': video_title
,
1290 'description': video_description
,
1291 'uploader': video_uploader
1298 class TumblrIE(InfoExtractor
):
1299 _VALID_URL
= r
'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1301 def _real_extract(self
, url
):
1302 m_url
= re
.match(self
._VALID
_URL
, url
)
1303 video_id
= m_url
.group('id')
1304 blog
= m_url
.group('blog_name')
1306 url
= 'http://%s.tumblr.com/post/%s/' % (blog
, video_id
)
1307 webpage
= self
._download
_webpage
(url
, video_id
)
1309 re_video
= r
'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog
, video_id
)
1310 video
= re
.search(re_video
, webpage
)
1312 raise ExtractorError(u
'Unable to extract video')
1313 video_url
= video
.group('video_url')
1314 ext
= video
.group('ext')
1316 video_thumbnail
= self
._search
_regex
(r
'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1317 webpage
, u
'thumbnail', fatal
=False) # We pick the first poster
1318 if video_thumbnail
: video_thumbnail
= video_thumbnail
.replace('\\', '')
1320 # The only place where you can get a title, it's not complete,
1321 # but searching in other places doesn't work for all videos
1322 video_title
= self
._html
_search
_regex
(r
'<title>(?P<title>.*?)</title>',
1323 webpage
, u
'title', flags
=re
.DOTALL
)
1325 return [{'id': video_id
,
1327 'title': video_title
,
1328 'thumbnail': video_thumbnail
,
1332 class BandcampIE(InfoExtractor
):
1333 _VALID_URL
= r
'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1335 def _real_extract(self
, url
):
1336 mobj
= re
.match(self
._VALID
_URL
, url
)
1337 title
= mobj
.group('title')
1338 webpage
= self
._download
_webpage
(url
, title
)
1339 # We get the link to the free download page
1340 m_download
= re
.search(r
'freeDownloadPage: "(.*?)"', webpage
)
1341 if m_download
is None:
1342 raise ExtractorError(u
'No free songs found')
1344 download_link
= m_download
.group(1)
1345 id = re
.search(r
'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1346 webpage
, re
.MULTILINE|re
.DOTALL
).group('id')
1348 download_webpage
= self
._download
_webpage
(download_link
, id,
1349 'Downloading free downloads page')
1350 # We get the dictionary of the track from some javascrip code
1351 info
= re
.search(r
'items: (.*?),$',
1352 download_webpage
, re
.MULTILINE
).group(1)
1353 info
= json
.loads(info
)[0]
1354 # We pick mp3-320 for now, until format selection can be easily implemented.
1355 mp3_info
= info
[u
'downloads'][u
'mp3-320']
1356 # If we try to use this url it says the link has expired
1357 initial_url
= mp3_info
[u
'url']
1358 re_url
= r
'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1359 m_url
= re
.match(re_url
, initial_url
)
1360 #We build the url we will use to get the final track url
1361 # This url is build in Bandcamp in the script download_bunde_*.js
1362 request_url
= '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url
.group('server'), m_url
.group('fsig'), id, m_url
.group('ts'))
1363 final_url_webpage
= self
._download
_webpage
(request_url
, id, 'Requesting download url')
1364 # If we could correctly generate the .rand field the url would be
1365 #in the "download_url" key
1366 final_url
= re
.search(r
'"retry_url":"(.*?)"', final_url_webpage
).group(1)
1368 track_info
= {'id':id,
1369 'title' : info
[u
'title'],
1372 'thumbnail' : info
[u
'thumb_url'],
1373 'uploader' : info
[u
'artist']
1378 class RedTubeIE(InfoExtractor
):
1379 """Information Extractor for redtube"""
1380 _VALID_URL
= r
'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1382 def _real_extract(self
,url
):
1383 mobj
= re
.match(self
._VALID
_URL
, url
)
1385 raise ExtractorError(u
'Invalid URL: %s' % url
)
1387 video_id
= mobj
.group('id')
1388 video_extension
= 'mp4'
1389 webpage
= self
._download
_webpage
(url
, video_id
)
1391 self
.report_extraction(video_id
)
1393 video_url
= self
._html
_search
_regex
(r
'<source src="(.+?)" type="video/mp4">',
1394 webpage
, u
'video URL')
1396 video_title
= self
._html
_search
_regex
('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1402 'ext': video_extension
,
1403 'title': video_title
,
1406 class InaIE(InfoExtractor
):
1407 """Information Extractor for Ina.fr"""
1408 _VALID_URL
= r
'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1410 def _real_extract(self
,url
):
1411 mobj
= re
.match(self
._VALID
_URL
, url
)
1413 video_id
= mobj
.group('id')
1414 mrss_url
='http://player.ina.fr/notices/%s.mrss' % video_id
1415 video_extension
= 'mp4'
1416 webpage
= self
._download
_webpage
(mrss_url
, video_id
)
1418 self
.report_extraction(video_id
)
1420 video_url
= self
._html
_search
_regex
(r
'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1421 webpage
, u
'video URL')
1423 video_title
= self
._search
_regex
(r
'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1429 'ext': video_extension
,
1430 'title': video_title
,
1433 class HowcastIE(InfoExtractor
):
1434 """Information Extractor for Howcast.com"""
1435 _VALID_URL
= r
'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1437 def _real_extract(self
, url
):
1438 mobj
= re
.match(self
._VALID
_URL
, url
)
1440 video_id
= mobj
.group('id')
1441 webpage_url
= 'http://www.howcast.com/videos/' + video_id
1442 webpage
= self
._download
_webpage
(webpage_url
, video_id
)
1444 self
.report_extraction(video_id
)
1446 video_url
= self
._search
_regex
(r
'\'?
file\'?
: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1447 webpage, u'video URL')
1449 video_title = self._html_search_regex(r'<meta content=(?:"([^
"]+)"|
\'([^
\']+)\') property=\'og
:title
\'',
1452 video_description = self._html_search_regex(r'<meta content
=(?
:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1453 webpage, u'description', fatal=False)
1455 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1456 webpage, u'thumbnail', fatal=False)
1462 'title': video_title,
1463 'description': video_description,
1464 'thumbnail': thumbnail,
1467 class VineIE(InfoExtractor):
1468 """Information Extractor for Vine.co"""
1469 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1471 def _real_extract(self, url):
1472 mobj = re.match(self._VALID_URL, url)
1474 video_id = mobj.group('id')
1475 webpage_url = 'https://vine.co/v/' + video_id
1476 webpage = self._download_webpage(webpage_url, video_id)
1478 self.report_extraction(video_id)
1480 video_url = self._html_search_regex(r'<meta property="twitter
:player
:stream
" content="(.+?
)"',
1481 webpage, u'video URL')
1483 video_title = self._html_search_regex(r'<meta property="og
:title
" content="(.+?
)"',
1486 thumbnail = self._html_search_regex(r'<meta property="og
:image
" content="(.+?
)(\?.*?
)?
"',
1487 webpage, u'thumbnail', fatal=False)
1489 uploader = self._html_search_regex(r'<div class="user
">.*?<h2>(.+?)</h2>',
1490 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1496 'title': video_title,
1497 'thumbnail': thumbnail,
1498 'uploader': uploader,
1501 class FlickrIE(InfoExtractor):
1502 """Information Extractor for Flickr videos"""
1503 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1505 def _real_extract(self, url):
1506 mobj = re.match(self._VALID_URL, url)
1508 video_id = mobj.group('id')
1509 video_uploader_id = mobj.group('uploader_id')
1510 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1511 webpage = self._download_webpage(webpage_url, video_id)
1513 secret = self._search_regex(r"photo_secret
: '(\w+)'", webpage, u'secret')
1515 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1516 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1518 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1519 first_xml, u'node_id')
1521 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1522 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1524 self.report_extraction(video_id)
1526 mobj = re.search(r'<STREAM APP="(.+?
)" FULLPATH="(.+?
)"', second_xml)
1528 raise ExtractorError(u'Unable to extract video url')
1529 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1531 video_title = self._html_search_regex(r'<meta property="og
:title
" content=(?:"([^
"]+)"|
\'([^
\']+)\')',
1532 webpage, u'video title
')
1534 video_description = self._html_search_regex(r'<meta
property="og:description" content
=(?
:"([^"]+)"|\'([^\']+)\')',
1535 webpage, u'description', fatal=False)
1537 thumbnail = self._html_search_regex(r'<meta property="og
:image
" content=(?:"([^
"]+)"|
\'([^
\']+)\')',
1538 webpage, u'thumbnail
', fatal=False)
1544 'title
': video_title,
1545 'description
': video_description,
1546 'thumbnail
': thumbnail,
1547 'uploader_id
': video_uploader_id,
1550 class TeamcocoIE(InfoExtractor):
1551 _VALID_URL = r'http
://teamcoco\
.com
/video
/(?P
<url_title
>.*)'
1553 def _real_extract(self, url):
1554 mobj = re.match(self._VALID_URL, url)
1556 raise ExtractorError(u'Invalid URL
: %s' % url)
1557 url_title = mobj.group('url_title
')
1558 webpage = self._download_webpage(url, url_title)
1560 video_id = self._html_search_regex(r'<article
class="video" data
-id="(\d+?)"',
1561 webpage, u'video
id')
1563 self.report_extraction(video_id)
1565 video_title = self._html_search_regex(r'<meta
property="og:title" content
="(.+?)"',
1568 thumbnail = self._html_search_regex(r'<meta
property="og:image" content
="(.+?)"',
1569 webpage, u'thumbnail
', fatal=False)
1571 video_description = self._html_search_regex(r'<meta
property="og:description" content
="(.*?)"',
1572 webpage, u'description
', fatal=False)
1574 data_url = 'http
://teamcoco
.com
/cvp
/2.0/%s.xml
' % video_id
1575 data = self._download_webpage(data_url, video_id, 'Downloading data webpage
')
1577 video_url = self._html_search_regex(r'<file type="high".*?
>(.*?
)</file>',
1584 'title
': video_title,
1585 'thumbnail
': thumbnail,
1586 'description
': video_description,
1589 class XHamsterIE(InfoExtractor):
1590 """Information Extractor for xHamster"""
1591 _VALID_URL = r'(?
:http
://)?
(?
:www
.)?xhamster\
.com
/movies
/(?P
<id>[0-9]+)/.*\
.html
'
1593 def _real_extract(self,url):
1594 mobj = re.match(self._VALID_URL, url)
1596 video_id = mobj.group('id')
1597 mrss_url = 'http
://xhamster
.com
/movies
/%s/.html
' % video_id
1598 webpage = self._download_webpage(mrss_url, video_id)
1600 mobj = re.search(r'\'srv
\': \'(?P
<server
>[^
\']*)\',\s
*\'file\': \'(?P
<file>[^
\']+)\',', webpage)
1602 raise ExtractorError(u'Unable to extract media URL
')
1603 if len(mobj.group('server
')) == 0:
1604 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1606 video_url = mobj.group('server
')+'/key
='+mobj.group('file')
1607 video_extension = video_url.split('.')[-1]
1609 video_title = self._html_search_regex(r'<title
>(?P
<title
>.+?
) - xHamster\
.com
</title
>',
1612 # Can't see the description anywhere
in the UI
1613 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1614 # webpage, u'description', fatal=False)
1615 # if video_description: video_description = unescapeHTML(video_description)
1617 mobj
= re
.search(r
'hint=\'(?P
<upload_date_Y
>[0-9]{4}
)-(?P
<upload_date_m
>[0-9]{2}
)-(?P
<upload_date_d
>[0-9]{2}
) [0-9]{2}
:[0-9]{2}
:[0-9]{2}
[A
-Z
]{3,4}
\'', webpage)
1619 video_upload_date = mobj.group('upload_date_Y
')+mobj.group('upload_date_m
')+mobj.group('upload_date_d
')
1621 video_upload_date = None
1622 self._downloader.report_warning(u'Unable to extract upload date
')
1624 video_uploader_id = self._html_search_regex(r'<a href
=\'/user
/[^
>]+>(?P
<uploader_id
>[^
<]+)',
1625 webpage, u'uploader
id', default=u'anonymous
')
1627 video_thumbnail = self._search_regex(r'\'image
\':\'(?P
<thumbnail
>[^
\']+)\'',
1628 webpage, u'thumbnail
', fatal=False)
1633 'ext
': video_extension,
1634 'title
': video_title,
1635 # 'description
': video_description,
1636 'upload_date
': video_upload_date,
1637 'uploader_id
': video_uploader_id,
1638 'thumbnail
': video_thumbnail
1641 class HypemIE(InfoExtractor):
1642 """Information Extractor for hypem"""
1643 _VALID_URL = r'(?
:http
://)?
(?
:www\
.)?hypem\
.com
/track
/([^
/]+)/([^
/]+)'
1645 def _real_extract(self, url):
1646 mobj = re.match(self._VALID_URL, url)
1648 raise ExtractorError(u'Invalid URL
: %s' % url)
1649 track_id = mobj.group(1)
1651 data = { 'ax': 1, 'ts': time.time() }
1652 data_encoded = compat_urllib_parse.urlencode(data)
1653 complete_url = url + "?" + data_encoded
1654 request = compat_urllib_request.Request(complete_url)
1655 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage
with the url
')
1656 cookie = urlh.headers.get('Set
-Cookie
', '')
1658 self.report_extraction(track_id)
1660 html_tracks = self._html_search_regex(r'<script
type="application/json" id="displayList-data">(.*?
)</script
>',
1661 response, u'tracks
', flags=re.MULTILINE|re.DOTALL).strip()
1663 track_list = json.loads(html_tracks)
1664 track = track_list[u'tracks
'][0]
1666 raise ExtractorError(u'Hypemachine contained invalid JSON
.')
1669 track_id = track[u"id"]
1670 artist = track[u"artist"]
1671 title = track[u"song"]
1673 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1674 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1675 request.add_header('cookie
', cookie)
1676 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata
')
1678 song_data = json.loads(song_data_json)
1680 raise ExtractorError(u'Hypemachine contained invalid JSON
.')
1681 final_url = song_data[u"url"]
1691 class Vbox7IE(InfoExtractor):
1692 """Information Extractor for Vbox7"""
1693 _VALID_URL = r'(?
:http
://)?
(?
:www\
.)?vbox7\
.com
/play
:([^
/]+)'
1695 def _real_extract(self,url):
1696 mobj = re.match(self._VALID_URL, url)
1698 raise ExtractorError(u'Invalid URL
: %s' % url)
1699 video_id = mobj.group(1)
1701 redirect_page, urlh = self._download_webpage_handle(url, video_id)
1702 new_location = self._search_regex(r'window\
.location
= \'(.*)\';', redirect_page, u'redirect location
')
1703 redirect_url = urlh.geturl() + new_location
1704 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page
')
1706 title = self._html_search_regex(r'<title
>(.*)</title
>',
1707 webpage, u'title
').split('/')[0].strip()
1710 info_url = "http://vbox7.com/play/magare.do"
1711 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1712 info_request = compat_urllib_request.Request(info_url, data)
1713 info_request.add_header('Content
-Type
', 'application
/x
-www
-form
-urlencoded
')
1714 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage
')
1715 if info_response is None:
1716 raise ExtractorError(u'Unable to extract the media url
')
1717 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1724 'thumbnail
': thumbnail_url,
1728 def gen_extractors():
1729 """ Return a list of an instance of every supported extractor.
1730 The order does matter; the first extractor matched is the one handling the URL.
1733 YoutubePlaylistIE(),
1758 StanfordOpenClassroomIE(),
1768 WorldStarHipHopIE(),
1798 def get_info_extractor(ie_name):
1799 """Returns the info extractor class with the given ie_name"""
1800 return globals()[ie_name+'IE
']