2 # -*- coding: utf-8 -*-
4 from __future__
import absolute_import
15 import xml
.etree
.ElementTree
24 from .extractor
.common
import InfoExtractor
, SearchInfoExtractor
26 from .extractor
.ard
import ARDIE
27 from .extractor
.arte
import ArteTvIE
28 from .extractor
.dailymotion
import DailymotionIE
29 from .extractor
.gametrailers
import GametrailersIE
30 from .extractor
.generic
import GenericIE
31 from .extractor
.metacafe
import MetacafeIE
32 from .extractor
.statigram
import StatigramIE
33 from .extractor
.photobucket
import PhotobucketIE
34 from .extractor
.vimeo
import VimeoIE
35 from .extractor
.yahoo
import YahooIE
36 from .extractor
.youtube
import YoutubeIE
, YoutubePlaylistIE
, YoutubeSearchIE
, YoutubeUserIE
, YoutubeChannelIE
37 from .extractor
.zdf
import ZDFIE
52 class GoogleSearchIE(SearchInfoExtractor
):
53 """Information Extractor for Google Video search queries."""
54 _MORE_PAGES_INDICATOR
= r
'id="pnnext" class="pn"'
56 IE_NAME
= u
'video.google:search'
57 _SEARCH_KEY
= 'gvsearch'
59 def _get_n_results(self
, query
, n
):
60 """Get a specified number of results for a query"""
68 for pagenum
in itertools
.count(1):
69 result_url
= u
'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse
.quote_plus(query
), pagenum
*10)
70 webpage
= self
._download
_webpage
(result_url
, u
'gvsearch:' + query
,
71 note
='Downloading result page ' + str(pagenum
))
73 for mobj
in re
.finditer(r
'<h3 class="r"><a href="([^"]+)"', webpage
):
78 res
['entries'].append(e
)
80 if (pagenum
* 10 > n
) or not re
.search(self
._MORE
_PAGES
_INDICATOR
, webpage
):
83 class YahooSearchIE(SearchInfoExtractor
):
84 """Information Extractor for Yahoo! Video search queries."""
87 IE_NAME
= u
'screen.yahoo:search'
88 _SEARCH_KEY
= 'yvsearch'
90 def _get_n_results(self
, query
, n
):
91 """Get a specified number of results for a query"""
98 for pagenum
in itertools
.count(0):
99 result_url
= u
'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse
.quote_plus(query
), pagenum
* 30)
100 webpage
= self
._download
_webpage
(result_url
, query
,
101 note
='Downloading results page '+str(pagenum
+1))
102 info
= json
.loads(webpage
)
104 results
= info
[u
'results']
106 for (i
, r
) in enumerate(results
):
107 if (pagenum
* 30) +i
>= n
:
109 mobj
= re
.search(r
'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r
)
110 e
= self
.url_result('http://' + mobj
.group('url'), 'Yahoo')
111 res
['entries'].append(e
)
112 if (pagenum
* 30 +i
>= n
) or (m
[u
'last'] >= (m
[u
'total'] -1 )):
118 class BlipTVUserIE(InfoExtractor
):
119 """Information Extractor for blip.tv users."""
121 _VALID_URL
= r
'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
123 IE_NAME
= u
'blip.tv:user'
125 def _real_extract(self
, url
):
127 mobj
= re
.match(self
._VALID
_URL
, url
)
129 raise ExtractorError(u
'Invalid URL: %s' % url
)
131 username
= mobj
.group(1)
133 page_base
= 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
135 page
= self
._download
_webpage
(url
, username
, u
'Downloading user page')
136 mobj
= re
.search(r
'data-users-id="([^"]+)"', page
)
137 page_base
= page_base
% mobj
.group(1)
140 # Download video ids using BlipTV Ajax calls. Result size per
141 # query is limited (currently to 12 videos) so we need to query
142 # page by page until there are no video ids - it means we got
149 url
= page_base
+ "&page=" + str(pagenum
)
150 page
= self
._download
_webpage
(url
, username
,
151 u
'Downloading video ids from page %d' % pagenum
)
153 # Extract video identifiers
156 for mobj
in re
.finditer(r
'href="/([^"]+)"', page
):
157 if mobj
.group(1) not in ids_in_page
:
158 ids_in_page
.append(unescapeHTML(mobj
.group(1)))
160 video_ids
.extend(ids_in_page
)
162 # A little optimization - if current page is not
163 # "full", ie. does not contain PAGE_SIZE video ids then
164 # we can assume that this page is the last one - there
165 # are no more ids on further pages - no need to query
168 if len(ids_in_page
) < self
._PAGE
_SIZE
:
173 urls
= [u
'http://blip.tv/%s' % video_id
for video_id
in video_ids
]
174 url_entries
= [self
.url_result(url
, 'BlipTV') for url
in urls
]
175 return [self
.playlist_result(url_entries
, playlist_title
= username
)]
178 class DepositFilesIE(InfoExtractor
):
179 """Information extractor for depositfiles.com"""
181 _VALID_URL
= r
'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
183 def _real_extract(self
, url
):
184 file_id
= url
.split('/')[-1]
185 # Rebuild url in english locale
186 url
= 'http://depositfiles.com/en/files/' + file_id
188 # Retrieve file webpage with 'Free download' button pressed
189 free_download_indication
= { 'gateway_result' : '1' }
190 request
= compat_urllib_request
.Request(url
, compat_urllib_parse
.urlencode(free_download_indication
))
192 self
.report_download_webpage(file_id
)
193 webpage
= compat_urllib_request
.urlopen(request
).read()
194 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
195 raise ExtractorError(u
'Unable to retrieve file webpage: %s' % compat_str(err
))
197 # Search for the real file URL
198 mobj
= re
.search(r
'<form action="(http://fileshare.+?)"', webpage
)
199 if (mobj
is None) or (mobj
.group(1) is None):
200 # Try to figure out reason of the error.
201 mobj
= re
.search(r
'<strong>(Attention.*?)</strong>', webpage
, re
.DOTALL
)
202 if (mobj
is not None) and (mobj
.group(1) is not None):
203 restriction_message
= re
.sub('\s+', ' ', mobj
.group(1)).strip()
204 raise ExtractorError(u
'%s' % restriction_message
)
206 raise ExtractorError(u
'Unable to extract download URL from: %s' % url
)
208 file_url
= mobj
.group(1)
209 file_extension
= os
.path
.splitext(file_url
)[1][1:]
211 # Search for file title
212 file_title
= self
._search
_regex
(r
'<b title="(.*?)">', webpage
, u
'title')
215 'id': file_id
.decode('utf-8'),
216 'url': file_url
.decode('utf-8'),
220 'ext': file_extension
.decode('utf-8'),
224 class FacebookIE(InfoExtractor
):
225 """Information Extractor for Facebook"""
227 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
228 _LOGIN_URL
= 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
229 _NETRC_MACHINE
= 'facebook'
230 IE_NAME
= u
'facebook'
232 def report_login(self
):
233 """Report attempt to log in."""
234 self
.to_screen(u
'Logging in')
236 def _real_initialize(self
):
237 if self
._downloader
is None:
242 downloader_params
= self
._downloader
.params
244 # Attempt to use provided username and password or .netrc data
245 if downloader_params
.get('username', None) is not None:
246 useremail
= downloader_params
['username']
247 password
= downloader_params
['password']
248 elif downloader_params
.get('usenetrc', False):
250 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
255 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
256 except (IOError, netrc
.NetrcParseError
) as err
:
257 self
._downloader
.report_warning(u
'parsing .netrc: %s' % compat_str(err
))
260 if useremail
is None:
269 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, compat_urllib_parse
.urlencode(login_form
))
272 login_results
= compat_urllib_request
.urlopen(request
).read()
273 if re
.search(r
'<form(.*)name="login"(.*)</form>', login_results
) is not None:
274 self
._downloader
.report_warning(u
'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
276 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
277 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
280 def _real_extract(self
, url
):
281 mobj
= re
.match(self
._VALID
_URL
, url
)
283 raise ExtractorError(u
'Invalid URL: %s' % url
)
284 video_id
= mobj
.group('ID')
286 url
= 'https://www.facebook.com/video/video.php?v=%s' % video_id
287 webpage
= self
._download
_webpage
(url
, video_id
)
289 BEFORE
= '{swf.addParam(param[0], param[1]);});\n'
290 AFTER
= '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
291 m
= re
.search(re
.escape(BEFORE
) + '(.*?)' + re
.escape(AFTER
), webpage
)
293 raise ExtractorError(u
'Cannot parse data')
294 data
= dict(json
.loads(m
.group(1)))
295 params_raw
= compat_urllib_parse
.unquote(data
['params'])
296 params
= json
.loads(params_raw
)
297 video_data
= params
['video_data'][0]
298 video_url
= video_data
.get('hd_src')
300 video_url
= video_data
['sd_src']
302 raise ExtractorError(u
'Cannot find video URL')
303 video_duration
= int(video_data
['video_duration'])
304 thumbnail
= video_data
['thumbnail_src']
306 video_title
= self
._html
_search
_regex
('<h2 class="uiHeaderTitle">([^<]+)</h2>',
311 'title': video_title
,
314 'duration': video_duration
,
315 'thumbnail': thumbnail
,
320 class BlipTVIE(InfoExtractor
):
321 """Information extractor for blip.tv"""
323 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
324 _URL_EXT
= r
'^.*\.([a-z0-9]+)$'
327 def report_direct_download(self
, title
):
328 """Report information extraction."""
329 self
.to_screen(u
'%s: Direct download detected' % title
)
331 def _real_extract(self
, url
):
332 mobj
= re
.match(self
._VALID
_URL
, url
)
334 raise ExtractorError(u
'Invalid URL: %s' % url
)
336 # See https://github.com/rg3/youtube-dl/issues/857
337 api_mobj
= re
.match(r
'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url
)
338 if api_mobj
is not None:
339 url
= 'http://blip.tv/play/g_%s' % api_mobj
.group('video_id')
340 urlp
= compat_urllib_parse_urlparse(url
)
341 if urlp
.path
.startswith('/play/'):
342 request
= compat_urllib_request
.Request(url
)
343 response
= compat_urllib_request
.urlopen(request
)
344 redirecturl
= response
.geturl()
345 rurlp
= compat_urllib_parse_urlparse(redirecturl
)
346 file_id
= compat_parse_qs(rurlp
.fragment
)['file'][0].rpartition('/')[2]
347 url
= 'http://blip.tv/a/a-' + file_id
348 return self
._real
_extract
(url
)
355 json_url
= url
+ cchar
+ 'skin=json&version=2&no_wrap=1'
356 request
= compat_urllib_request
.Request(json_url
)
357 request
.add_header('User-Agent', 'iTunes/10.6.1')
358 self
.report_extraction(mobj
.group(1))
361 urlh
= compat_urllib_request
.urlopen(request
)
362 if urlh
.headers
.get('Content-Type', '').startswith('video/'): # Direct download
363 basename
= url
.split('/')[-1]
364 title
,ext
= os
.path
.splitext(basename
)
365 title
= title
.decode('UTF-8')
366 ext
= ext
.replace('.', '')
367 self
.report_direct_download(title
)
377 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
378 raise ExtractorError(u
'ERROR: unable to download video info webpage: %s' % compat_str(err
))
379 if info
is None: # Regular URL
381 json_code_bytes
= urlh
.read()
382 json_code
= json_code_bytes
.decode('utf-8')
383 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
384 raise ExtractorError(u
'Unable to read video info webpage: %s' % compat_str(err
))
387 json_data
= json
.loads(json_code
)
388 if 'Post' in json_data
:
389 data
= json_data
['Post']
393 upload_date
= datetime
.datetime
.strptime(data
['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
394 video_url
= data
['media']['url']
395 umobj
= re
.match(self
._URL
_EXT
, video_url
)
397 raise ValueError('Can not determine filename extension')
401 'id': data
['item_id'],
403 'uploader': data
['display_name'],
404 'upload_date': upload_date
,
405 'title': data
['title'],
407 'format': data
['media']['mimeType'],
408 'thumbnail': data
['thumbnailUrl'],
409 'description': data
['description'],
410 'player_url': data
['embedUrl'],
411 'user_agent': 'iTunes/10.6.1',
413 except (ValueError,KeyError) as err
:
414 raise ExtractorError(u
'Unable to parse video information: %s' % repr(err
))
419 class MyVideoIE(InfoExtractor
):
420 """Information Extractor for myvideo.de."""
422 _VALID_URL
= r
'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
425 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
426 # Released into the Public Domain by Tristan Fischer on 2013-05-19
427 # https://github.com/rg3/youtube-dl/pull/842
428 def __rc4crypt(self
,data
, key
):
430 box
= list(range(256))
431 for i
in list(range(256)):
432 x
= (x
+ box
[i
] + compat_ord(key
[i
% len(key
)])) % 256
433 box
[i
], box
[x
] = box
[x
], box
[i
]
439 y
= (y
+ box
[x
]) % 256
440 box
[x
], box
[y
] = box
[y
], box
[x
]
441 out
+= chr(compat_ord(char
) ^ box
[(box
[x
] + box
[y
]) % 256])
445 return hashlib
.md5(s
).hexdigest().encode()
447 def _real_extract(self
,url
):
448 mobj
= re
.match(self
._VALID
_URL
, url
)
450 raise ExtractorError(u
'invalid URL: %s' % url
)
452 video_id
= mobj
.group(1)
455 b
'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
456 b
'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
457 b
'TnpsbA0KTVRkbU1tSTRNdz09'
461 webpage_url
= 'http://www.myvideo.de/watch/%s' % video_id
462 webpage
= self
._download
_webpage
(webpage_url
, video_id
)
464 mobj
= re
.search('source src=\'(.+?)[.]([^.]+)\'', webpage
)
466 self
.report_extraction(video_id
)
467 video_url
= mobj
.group(1) + '.flv'
469 video_title
= self
._html
_search
_regex
('<title>([^<]+)</title>',
472 video_ext
= self
._search
_regex
('[.](.+?)$', video_url
, u
'extension')
479 'title': video_title
,
484 mobj
= re
.search('var flashvars={(.+?)}', webpage
)
486 raise ExtractorError(u
'Unable to extract video')
491 for (a
, b
) in re
.findall('(.+?):\'(.+?)\',?', sec
):
492 if not a
== '_encxml':
495 encxml
= compat_urllib_parse
.unquote(b
)
496 if not params
.get('domain'):
497 params
['domain'] = 'www.myvideo.de'
498 xmldata_url
= '%s?%s' % (encxml
, compat_urllib_parse
.urlencode(params
))
499 if 'flash_playertype=MTV' in xmldata_url
:
500 self
._downloader
.report_warning(u
'avoiding MTV player')
502 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
503 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
507 enc_data
= self
._download
_webpage
(xmldata_url
, video_id
).split('=')[1]
508 enc_data_b
= binascii
.unhexlify(enc_data
)
510 base64
.b64decode(base64
.b64decode(GK
)) +
512 str(video_id
).encode('utf-8')
515 dec_data
= self
.__rc
4crypt
(enc_data_b
, sk
)
518 self
.report_extraction(video_id
)
521 mobj
= re
.search('connectionurl=\'(.*?)\'', dec_data
)
523 video_url
= compat_urllib_parse
.unquote(mobj
.group(1))
524 if 'myvideo2flash' in video_url
:
525 self
._downloader
.report_warning(u
'forcing RTMPT ...')
526 video_url
= video_url
.replace('rtmpe://', 'rtmpt://')
529 # extract non rtmp videos
530 mobj
= re
.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data
)
532 raise ExtractorError(u
'unable to extract url')
533 video_url
= compat_urllib_parse
.unquote(mobj
.group(1)) + compat_urllib_parse
.unquote(mobj
.group(2))
535 video_file
= self
._search
_regex
('source=\'(.*?)\'', dec_data
, u
'video file')
536 video_file
= compat_urllib_parse
.unquote(video_file
)
538 if not video_file
.endswith('f4m'):
539 ppath
, prefix
= video_file
.split('.')
540 video_playpath
= '%s:%s' % (prefix
, ppath
)
541 video_hls_playlist
= ''
544 video_hls_playlist
= (
545 video_filepath
+ video_file
546 ).replace('.f4m', '.m3u8')
548 video_swfobj
= self
._search
_regex
('swfobject.embedSWF\(\'(.+?)\'', webpage
, u
'swfobj')
549 video_swfobj
= compat_urllib_parse
.unquote(video_swfobj
)
551 video_title
= self
._html
_search
_regex
("<h1(?: class='globalHd')?>(.*?)</h1>",
560 'title': video_title
,
562 'play_path': video_playpath
,
563 'video_file': video_file
,
564 'video_hls_playlist': video_hls_playlist
,
565 'player_url': video_swfobj
,
569 class ComedyCentralIE(InfoExtractor
):
570 """Information extractor for The Daily Show and Colbert Report """
572 # urls can be abbreviations like :thedailyshow or :colbert
573 # urls for episodes like:
574 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
575 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
576 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
577 _VALID_URL
= r
"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
578 |(https?://)?(www\.)?
579 (?P<showname>thedailyshow|colbertnation)\.com/
580 (full-episodes/(?P<episode>.*)|
582 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
583 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
586 _available_formats
= ['3500', '2200', '1700', '1200', '750', '400']
588 _video_extensions
= {
596 _video_dimensions
= {
606 def suitable(cls
, url
):
607 """Receives a URL and returns True if suitable for this IE."""
608 return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None
610 def _print_formats(self
, formats
):
611 print('Available formats:')
613 print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'mp4'), self
._video
_dimensions
.get(x
, '???')))
616 def _real_extract(self
, url
):
617 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
619 raise ExtractorError(u
'Invalid URL: %s' % url
)
621 if mobj
.group('shortname'):
622 if mobj
.group('shortname') in ('tds', 'thedailyshow'):
623 url
= u
'http://www.thedailyshow.com/full-episodes/'
625 url
= u
'http://www.colbertnation.com/full-episodes/'
626 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
627 assert mobj
is not None
629 if mobj
.group('clip'):
630 if mobj
.group('showname') == 'thedailyshow':
631 epTitle
= mobj
.group('tdstitle')
633 epTitle
= mobj
.group('cntitle')
636 dlNewest
= not mobj
.group('episode')
638 epTitle
= mobj
.group('showname')
640 epTitle
= mobj
.group('episode')
642 self
.report_extraction(epTitle
)
643 webpage
,htmlHandle
= self
._download
_webpage
_handle
(url
, epTitle
)
645 url
= htmlHandle
.geturl()
646 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
648 raise ExtractorError(u
'Invalid redirected URL: ' + url
)
649 if mobj
.group('episode') == '':
650 raise ExtractorError(u
'Redirected URL is still not specific: ' + url
)
651 epTitle
= mobj
.group('episode')
653 mMovieParams
= re
.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage
)
655 if len(mMovieParams
) == 0:
656 # The Colbert Report embeds the information in a without
657 # a URL prefix; so extract the alternate reference
658 # and then add the URL prefix manually.
660 altMovieParams
= re
.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage
)
661 if len(altMovieParams
) == 0:
662 raise ExtractorError(u
'unable to find Flash URL in webpage ' + url
)
664 mMovieParams
= [("http://media.mtvnservices.com/" + altMovieParams
[0], altMovieParams
[0])]
666 uri
= mMovieParams
[0][1]
667 indexUrl
= 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse
.urlencode({'uri': uri}
)
668 indexXml
= self
._download
_webpage
(indexUrl
, epTitle
,
669 u
'Downloading show index',
670 u
'unable to download episode index')
674 idoc
= xml
.etree
.ElementTree
.fromstring(indexXml
)
675 itemEls
= idoc
.findall('.//item')
676 for partNum
,itemEl
in enumerate(itemEls
):
677 mediaId
= itemEl
.findall('./guid')[0].text
678 shortMediaId
= mediaId
.split(':')[-1]
679 showId
= mediaId
.split(':')[-2].replace('.com', '')
680 officialTitle
= itemEl
.findall('./title')[0].text
681 officialDate
= unified_strdate(itemEl
.findall('./pubDate')[0].text
)
683 configUrl
= ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
684 compat_urllib_parse
.urlencode({'uri': mediaId}
))
685 configXml
= self
._download
_webpage
(configUrl
, epTitle
,
686 u
'Downloading configuration for %s' % shortMediaId
)
688 cdoc
= xml
.etree
.ElementTree
.fromstring(configXml
)
690 for rendition
in cdoc
.findall('.//rendition'):
691 finfo
= (rendition
.attrib
['bitrate'], rendition
.findall('./src')[0].text
)
695 self
._downloader
.report_error(u
'unable to download ' + mediaId
+ ': No videos found')
698 if self
._downloader
.params
.get('listformats', None):
699 self
._print
_formats
([i
[0] for i
in turls
])
702 # For now, just pick the highest bitrate
703 format
,rtmp_video_url
= turls
[-1]
705 # Get the format arg from the arg stream
706 req_format
= self
._downloader
.params
.get('format', None)
708 # Select format if we can find one
711 format
, rtmp_video_url
= f
, v
714 m
= re
.match(r
'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url
)
716 raise ExtractorError(u
'Cannot transform RTMP url')
717 base
= 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
718 video_url
= base
+ m
.group('finalid')
720 effTitle
= showId
+ u
'-' + epTitle
+ u
' part ' + compat_str(partNum
+1)
725 'upload_date': officialDate
,
730 'description': officialTitle
,
737 class EscapistIE(InfoExtractor
):
738 """Information extractor for The Escapist """
740 _VALID_URL
= r
'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
741 IE_NAME
= u
'escapist'
743 def _real_extract(self
, url
):
744 mobj
= re
.match(self
._VALID
_URL
, url
)
746 raise ExtractorError(u
'Invalid URL: %s' % url
)
747 showName
= mobj
.group('showname')
748 videoId
= mobj
.group('episode')
750 self
.report_extraction(videoId
)
751 webpage
= self
._download
_webpage
(url
, videoId
)
753 videoDesc
= self
._html
_search
_regex
('<meta name="description" content="([^"]*)"',
754 webpage
, u
'description', fatal
=False)
756 imgUrl
= self
._html
_search
_regex
('<meta property="og:image" content="([^"]*)"',
757 webpage
, u
'thumbnail', fatal
=False)
759 playerUrl
= self
._html
_search
_regex
('<meta property="og:video" content="([^"]*)"',
760 webpage
, u
'player url')
762 title
= self
._html
_search
_regex
('<meta name="title" content="([^"]*)"',
763 webpage
, u
'player url').split(' : ')[-1]
765 configUrl
= self
._search
_regex
('config=(.*)$', playerUrl
, u
'config url')
766 configUrl
= compat_urllib_parse
.unquote(configUrl
)
768 configJSON
= self
._download
_webpage
(configUrl
, videoId
,
769 u
'Downloading configuration',
770 u
'unable to download configuration')
772 # Technically, it's JavaScript, not JSON
773 configJSON
= configJSON
.replace("'", '"')
776 config
= json
.loads(configJSON
)
777 except (ValueError,) as err
:
778 raise ExtractorError(u
'Invalid JSON in configuration file: ' + compat_str(err
))
780 playlist
= config
['playlist']
781 videoUrl
= playlist
[1]['url']
786 'uploader': showName
,
791 'description': videoDesc
,
792 'player_url': playerUrl
,
797 class CollegeHumorIE(InfoExtractor
):
798 """Information extractor for collegehumor.com"""
801 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
802 IE_NAME
= u
'collegehumor'
    def report_manifest(self, video_id):
        """Report that the XML manifest for *video_id* is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)
808 def _real_extract(self
, url
):
809 mobj
= re
.match(self
._VALID
_URL
, url
)
811 raise ExtractorError(u
'Invalid URL: %s' % url
)
812 video_id
= mobj
.group('videoid')
820 self
.report_extraction(video_id
)
821 xmlUrl
= 'http://www.collegehumor.com/moogaloop/video/' + video_id
823 metaXml
= compat_urllib_request
.urlopen(xmlUrl
).read()
824 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
825 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
827 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
829 videoNode
= mdoc
.findall('./video')[0]
830 info
['description'] = videoNode
.findall('./description')[0].text
831 info
['title'] = videoNode
.findall('./caption')[0].text
832 info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
833 manifest_url
= videoNode
.findall('./file')[0].text
835 raise ExtractorError(u
'Invalid metadata XML file')
837 manifest_url
+= '?hdcore=2.10.3'
838 self
.report_manifest(video_id
)
840 manifestXml
= compat_urllib_request
.urlopen(manifest_url
).read()
841 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
842 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
844 adoc
= xml
.etree
.ElementTree
.fromstring(manifestXml
)
846 media_node
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
847 node_id
= media_node
.attrib
['url']
848 video_id
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
849 except IndexError as err
:
850 raise ExtractorError(u
'Invalid manifest file')
852 url_pr
= compat_urllib_parse_urlparse(manifest_url
)
853 url
= url_pr
.scheme
+ '://' + url_pr
.netloc
+ '/z' + video_id
[:-2] + '/' + node_id
+ 'Seg1-Frag1'
860 class XVideosIE(InfoExtractor
):
861 """Information extractor for xvideos.com"""
863 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
866 def _real_extract(self
, url
):
867 mobj
= re
.match(self
._VALID
_URL
, url
)
869 raise ExtractorError(u
'Invalid URL: %s' % url
)
870 video_id
= mobj
.group(1)
872 webpage
= self
._download
_webpage
(url
, video_id
)
874 self
.report_extraction(video_id
)
877 video_url
= compat_urllib_parse
.unquote(self
._search
_regex
(r
'flv_url=(.+?)&',
878 webpage
, u
'video URL'))
881 video_title
= self
._html
_search
_regex
(r
'<title>(.*?)\s+-\s+XVID',
884 # Extract video thumbnail
885 video_thumbnail
= self
._search
_regex
(r
'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
886 webpage
, u
'thumbnail', fatal
=False)
893 'title': video_title
,
895 'thumbnail': video_thumbnail
,
902 class SoundcloudIE(InfoExtractor
):
903 """Information extractor for soundcloud.com
904 To access the media, the uid of the song and a stream token
905 must be extracted from the page source and the script must make
906 a request to media.soundcloud.com/crossdomain.xml. Then
907 the media can be grabbed by requesting from an url composed
908 of the stream token and uid
911 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
912 IE_NAME
= u
'soundcloud'
    def report_resolve(self, video_id):
        """Report that *video_id* is being resolved via the Soundcloud API."""
        self.to_screen(u'%s: Resolving id' % video_id)
918 def _real_extract(self
, url
):
919 mobj
= re
.match(self
._VALID
_URL
, url
)
921 raise ExtractorError(u
'Invalid URL: %s' % url
)
923 # extract uploader (which is in the url)
924 uploader
= mobj
.group(1)
925 # extract simple title (uploader + slug of song title)
926 slug_title
= mobj
.group(2)
927 simple_title
= uploader
+ u
'-' + slug_title
928 full_title
= '%s/%s' % (uploader
, slug_title
)
930 self
.report_resolve(full_title
)
932 url
= 'http://soundcloud.com/%s/%s' % (uploader
, slug_title
)
933 resolv_url
= 'http://api.soundcloud.com/resolve.json?url=' + url
+ '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
934 info_json
= self
._download
_webpage
(resolv_url
, full_title
, u
'Downloading info JSON')
936 info
= json
.loads(info_json
)
937 video_id
= info
['id']
938 self
.report_extraction(full_title
)
940 streams_url
= 'https://api.sndcdn.com/i1/tracks/' + str(video_id
) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
941 stream_json
= self
._download
_webpage
(streams_url
, full_title
,
942 u
'Downloading stream definitions',
943 u
'unable to download stream definitions')
945 streams
= json
.loads(stream_json
)
946 mediaURL
= streams
['http_mp3_128_url']
947 upload_date
= unified_strdate(info
['created_at'])
952 'uploader': info
['user']['username'],
953 'upload_date': upload_date
,
954 'title': info
['title'],
956 'description': info
['description'],
959 class SoundcloudSetIE(InfoExtractor
):
960 """Information extractor for soundcloud.com sets
961 To access the media, the uid of the song and a stream token
962 must be extracted from the page source and the script must make
963 a request to media.soundcloud.com/crossdomain.xml. Then
964 the media can be grabbed by requesting from an url composed
965 of the stream token and uid
968 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
969 IE_NAME
= u
'soundcloud:set'
    def report_resolve(self, video_id):
        """Report that the set *video_id* is being resolved via the API."""
        self.to_screen(u'%s: Resolving id' % video_id)
975 def _real_extract(self
, url
):
976 mobj
= re
.match(self
._VALID
_URL
, url
)
978 raise ExtractorError(u
'Invalid URL: %s' % url
)
980 # extract uploader (which is in the url)
981 uploader
= mobj
.group(1)
982 # extract simple title (uploader + slug of song title)
983 slug_title
= mobj
.group(2)
984 simple_title
= uploader
+ u
'-' + slug_title
985 full_title
= '%s/sets/%s' % (uploader
, slug_title
)
987 self
.report_resolve(full_title
)
989 url
= 'http://soundcloud.com/%s/sets/%s' % (uploader
, slug_title
)
990 resolv_url
= 'http://api.soundcloud.com/resolve.json?url=' + url
+ '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
991 info_json
= self
._download
_webpage
(resolv_url
, full_title
)
994 info
= json
.loads(info_json
)
996 for err
in info
['errors']:
997 self
._downloader
.report_error(u
'unable to download video webpage: %s' % compat_str(err
['error_message']))
1000 self
.report_extraction(full_title
)
1001 for track
in info
['tracks']:
1002 video_id
= track
['id']
1004 streams_url
= 'https://api.sndcdn.com/i1/tracks/' + str(video_id
) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1005 stream_json
= self
._download
_webpage
(streams_url
, video_id
, u
'Downloading track info JSON')
1007 self
.report_extraction(video_id
)
1008 streams
= json
.loads(stream_json
)
1009 mediaURL
= streams
['http_mp3_128_url']
1014 'uploader': track
['user']['username'],
1015 'upload_date': unified_strdate(track
['created_at']),
1016 'title': track
['title'],
1018 'description': track
['description'],
1023 class InfoQIE(InfoExtractor
):
1024 """Information extractor for infoq.com"""
1025 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
1027 def _real_extract(self
, url
):
1028 mobj
= re
.match(self
._VALID
_URL
, url
)
1030 raise ExtractorError(u
'Invalid URL: %s' % url
)
1032 webpage
= self
._download
_webpage
(url
, video_id
=url
)
1033 self
.report_extraction(url
)
1036 mobj
= re
.search(r
"jsclassref ?= ?'([^']*)'", webpage
)
1038 raise ExtractorError(u
'Unable to extract video url')
1039 real_id
= compat_urllib_parse
.unquote(base64
.b64decode(mobj
.group(1).encode('ascii')).decode('utf-8'))
1040 video_url
= 'rtmpe://video.infoq.com/cfx/st/' + real_id
1043 video_title
= self
._search
_regex
(r
'contentTitle = "(.*?)";',
1046 # Extract description
1047 video_description
= self
._html
_search
_regex
(r
'<meta name="description" content="(.*)"(?:\s*/)?>',
1048 webpage
, u
'description', fatal
=False)
1050 video_filename
= video_url
.split('/')[-1]
1051 video_id
, extension
= video_filename
.split('.')
1057 'upload_date': None,
1058 'title': video_title
,
1059 'ext': extension
, # Extension is always(?) mp4, but seems to be flv
1061 'description': video_description
,
1066 class MixcloudIE(InfoExtractor
):
1067 """Information extractor for www.mixcloud.com"""
1069 _WORKING
= False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1070 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1071 IE_NAME
= u
'mixcloud'
    def report_download_json(self, file_id):
        """Report that the track's JSON metadata is being downloaded."""
        # NOTE(review): file_id is accepted but not included in the message.
        self.to_screen(u'Downloading json')
1077 def get_urls(self
, jsonData
, fmt
, bitrate
='best'):
1078 """Get urls from 'audio_formats' section in json"""
1081 bitrate_list
= jsonData
[fmt
]
1082 if bitrate
is None or bitrate
== 'best' or bitrate
not in bitrate_list
:
1083 bitrate
= max(bitrate_list
) # select highest
1085 url_list
= jsonData
[fmt
][bitrate
]
1086 except TypeError: # we have no bitrate info.
1087 url_list
= jsonData
[fmt
]
1090 def check_urls(self
, url_list
):
1091 """Returns 1st active url from list"""
1092 for url
in url_list
:
1094 compat_urllib_request
.urlopen(url
)
1096 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1101 def _print_formats(self
, formats
):
1102 print('Available formats:')
1103 for fmt
in formats
.keys():
1104 for b
in formats
[fmt
]:
1106 ext
= formats
[fmt
][b
][0]
1107 print('%s\t%s\t[%s]' % (fmt
, b
, ext
.split('.')[-1]))
1108 except TypeError: # we have no bitrate info
1109 ext
= formats
[fmt
][0]
1110 print('%s\t%s\t[%s]' % (fmt
, '??', ext
.split('.')[-1]))
1113 def _real_extract(self
, url
):
1114 mobj
= re
.match(self
._VALID
_URL
, url
)
1116 raise ExtractorError(u
'Invalid URL: %s' % url
)
1117 # extract uploader & filename from url
1118 uploader
= mobj
.group(1).decode('utf-8')
1119 file_id
= uploader
+ "-" + mobj
.group(2).decode('utf-8')
1121 # construct API request
1122 file_url
= 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url
.split('/')[-3:-1]) + '.json'
1123 # retrieve .json file with links to files
1124 request
= compat_urllib_request
.Request(file_url
)
1126 self
.report_download_json(file_url
)
1127 jsonData
= compat_urllib_request
.urlopen(request
).read()
1128 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1129 raise ExtractorError(u
'Unable to retrieve file: %s' % compat_str(err
))
1132 json_data
= json
.loads(jsonData
)
1133 player_url
= json_data
['player_swf_url']
1134 formats
= dict(json_data
['audio_formats'])
1136 req_format
= self
._downloader
.params
.get('format', None)
1139 if self
._downloader
.params
.get('listformats', None):
1140 self
._print
_formats
(formats
)
1143 if req_format
is None or req_format
== 'best':
1144 for format_param
in formats
.keys():
1145 url_list
= self
.get_urls(formats
, format_param
)
1147 file_url
= self
.check_urls(url_list
)
1148 if file_url
is not None:
1151 if req_format
not in formats
:
1152 raise ExtractorError(u
'Format is not available')
1154 url_list
= self
.get_urls(formats
, req_format
)
1155 file_url
= self
.check_urls(url_list
)
1156 format_param
= req_format
1159 'id': file_id
.decode('utf-8'),
1160 'url': file_url
.decode('utf-8'),
1161 'uploader': uploader
.decode('utf-8'),
1162 'upload_date': None,
1163 'title': json_data
['name'],
1164 'ext': file_url
.split('.')[-1].decode('utf-8'),
1165 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
1166 'thumbnail': json_data
['thumbnail_url'],
1167 'description': json_data
['description'],
1168 'player_url': player_url
.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): 'performer' is not assigned anywhere in the visible
            # code (possibly defined on an elided line) — verify; it may have
            # been meant to be song_name.
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 downloads)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seeded character permutation used to decode file ids."""
        mixed = []  # accumulator restored (its .append below requires the list)
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step drives which source char comes next.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Map each '*'-separated index in fileId through the seeded mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the last page of archives.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in lockstep: one url, title, thumb per video.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Direct CDN URL derived from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': video_title,
                'uploader': uploader,
                'thumbnail': thumbnail,
               }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Container is inferred from the URL itself.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : video_title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in an inline JS assignment (gon.show=...).
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': data.get('host', {}).get('name'),
                'uploader_id': data.get('host', {}).get('slug'),
                'thumbnail': data.get('image', {}).get('large_url_2x'),
                'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the first entry matching the requested format, else None.
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes resolution and bitrate, e.g. 480p_370k.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the embedded player config
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:  # guard restored
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        # mix_id assignment reconstructed (its line is elided in this view but
        # the API URLs below require it); the mix JSON carries its own id.
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API only reveals one track at a time; iterate until it flags the last one.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information Extractor for keek.com short videos."""
    # Generalized to also accept https:// URLs (previously http-only).
    _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        """Build the CDN media/thumbnail URLs from the id and scrape metadata."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media and thumbnail URLs are deterministic on the CDN.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information Extractor for ted.com talks and playlists."""
    _VALID_URL = r'''http://www\.ted\.com/
                     (
                     ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                     |
                     ((?P<type_talk>talks)) # We have a simple talk
                     )
                     (/lang/(.*?))? # The url may contain the language
                     /(?P<name>\w+) # Here goes the name and then ".html"
                     '''

    # FIX: suitable() takes cls and is invoked on the class during extractor
    # dispatch, so it must be a classmethod.
    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with comments/whitespace, hence re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Pair each talk entry with its title/URL match; each talk is
        # delegated to the TED extractor itself via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags=re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        return {
                'id': info['id'],
                # We pick the best quality stream (last entry) from the talk details.
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
class MySpassIE(InfoExtractor):
    """Information Extractor for myspass.de."""
    # FIX: dots escaped — the previous pattern's bare '.' matched any character.
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Derive the video id from the URL path and read an XML metadata feed."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        # format/description/thumbnail are optional; fall back gracefully
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # renamed from `format` to avoid shadowing the builtin
            video_format = None
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Read the per-video XML descriptor and pick the last listed format."""
        match = re.match(self._VALID_URL, url)
        vid = match.group('videoID')

        page = self._download_webpage(url, vid)
        title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            page, u'title')

        # Each video has a companion XML document listing available formats.
        xml_url = u'http://video2.spiegel.de/flash/' + vid + u'.xml'
        xml_code = self._download_webpage(xml_url, vid,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry is the best quality variant.
        best = idoc[-1]
        media_file = best.findall('./filename')[0].text
        length = float(best.findall('./duration')[0].text)

        return [{
            'id': vid,
            'url': 'http://video2.spiegel.de/flash/' + media_file,
            'ext': media_file.rpartition('.')[2],
            'title': title,
            'duration': length,
        }]
class LiveLeakIE(InfoExtractor):
    """Information Extractor for liveleak.com."""
    # FIX: the scheme group read '(?:http?://)?' — 'http?' matches 'htt' or
    # 'http' and can never match 'https'. Corrected to 'https?'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Scrape the player config and OpenGraph tags of a LiveLeak page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Media URL is passed to the JS player as file: "..."
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a site prefix we strip off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class TumblrIE(InfoExtractor):
    """Information Extractor for tumblr.com video posts."""
    # FIX: the id group was '\d*', which also matched an empty id and produced
    # a broken post URL; require at least one digit.
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d+)/(.*?)'

    def _real_extract(self, url):
        """Normalize the post URL and scrape the inline video player markup."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped (\x22 for quotes). FIX: the blog name
        # is interpolated into a regex, so escape it (it may contain dots).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Follow the free-download flow to obtain a non-expiring mp3 URL."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: local was named `id`, shadowing the builtin; renamed.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    # Generalized to also accept https:// URLs (previously http-only).
    _VALID_URL = r'(?:https?://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Scrape the <source> tag and page title of a RedTube page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Fetch the per-video MRSS notice and read the mp4 URL and title."""
        match = re.match(self._VALID_URL, url)

        vid = match.group('id')
        # Metadata lives in a per-video MRSS document, not the HTML page.
        notice_url = 'http://player.ina.fr/notices/%s.mrss' % vid
        notice = self._download_webpage(notice_url, vid)

        self.report_extraction(vid)

        media_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')

        media_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        return [{
            'id': vid,
            'url': media_url,
            'ext': 'mp4',
            'title': media_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Canonicalize the URL from the id, then scrape player and meta tags."""
        match = re.match(self._VALID_URL, url)

        vid = match.group('id')
        canonical_url = 'http://www.howcast.com/videos/' + vid
        page = self._download_webpage(canonical_url, vid)

        self.report_extraction(vid)

        # Direct mp4 URL is handed to the JS player via a file: entry.
        media_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # Meta tags may use either quote style; the regex accepts both.
        title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)

        thumb = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': vid,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumb,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Scrape the Twitter-player stream URL and OpenGraph metadata."""
        match = re.match(self._VALID_URL, url)

        vid = match.group('id')
        canonical_url = 'https://vine.co/v/' + vid
        page = self._download_webpage(canonical_url, vid)

        self.report_extraction(vid)

        stream_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')

        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        thumb = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)

        poster = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': vid,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'uploader': poster,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve the two-step Flickr video playlist flow to a stream URL."""
        match = re.match(self._VALID_URL, url)

        vid = match.group('id')
        owner = match.group('uploader_id')
        page_url = 'http://www.flickr.com/photos/' + owner + '/' + vid
        page = self._download_webpage(page_url, vid)

        # The photo "secret" authorizes the video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + vid + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, vid, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, vid, 'Downloading second data webpage')

        self.report_extraction(vid)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Final URL is the app base plus the HTML-escaped full path.
        stream_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')

        description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)

        thumb = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': vid,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumb,
            'uploader_id': owner,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com."""
    # Generalized: also accept https and the www. host variant.
    _VALID_URL = r'https?://(?:www\.)?teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Find the numeric video id in the page, then read its XML descriptor."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # Stream URLs are published in a per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # FIX: '(?:www.)?' had an unescaped dot; escape it so it only matches 'www.'
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Scrape the flash-player config of an xHamster movie page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # Player config: 'srv' is the server prefix, 'file' the media path.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server prefix: the file entry is a complete, URL-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a Hype Machine track page to its serve-URL via the JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped request is required; the response cookie authorizes
        # the later /serve/source call.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # FIX: `key` was used below but never assigned (NameError); it comes
        # from the track record alongside id/artist/song.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': 'mp3',
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Follow the JS redirect, then POST to the info endpoint for the media URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        vid = match.group(1)

        # The play page issues a JS redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, vid)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        target = urlh.geturl() + new_location
        page = self._download_webpage(target, vid, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            page, u'title').split('/')[0].strip()

        # The real media URL comes from a form-encoded POST endpoint.
        info_url = "http://vbox7.com/play/magare.do"
        payload = compat_urllib_parse.urlencode({'as3': '1', 'vid': vid})
        info_request = compat_urllib_request.Request(info_url, payload)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, vid, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is 'url=...&thumb=...'; keep the right-hand sides.
        (final_url, thumbnail_url) = map(lambda pair: pair.split('=')[1], info_response.split('&'))

        return [{
            'id': vid,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
# NOTE(review): this span was garbled by extraction — the body of the ordered
# extractor registry list was dropped (only three entries survive) along with
# the return statement. Do not edit in place; restore the full list from
# upstream. The order matters: the first matching extractor handles the URL.
2777 def gen_extractors():
2778 """ Return a list of an instance of every supported extractor.
2779 The order does matter; the first extractor matched is the one handling the URL.
2782 YoutubePlaylistIE(),
2807 StanfordOpenClassroomIE(),
2817 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name.

    Looks up ``<ie_name>IE`` among this module's globals; a KeyError
    propagates if no such extractor class exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]