2 # -*- coding: utf-8 -*-
4 from __future__
import absolute_import
15 import xml
.etree
.ElementTree
24 from .extractor
.common
import InfoExtractor
, SearchInfoExtractor
26 from .extractor
.ard
import ARDIE
27 from .extractor
.arte
import ArteTvIE
28 from .extractor
.dailymotion
import DailymotionIE
29 from .extractor
.gametrailers
import GametrailersIE
30 from .extractor
.generic
import GenericIE
31 from .extractor
.metacafe
import MetacafeIE
32 from .extractor
.statigram
import StatigramIE
33 from .extractor
.photobucket
import PhotobucketIE
34 from .extractor
.vimeo
import VimeoIE
35 from .extractor
.yahoo
import YahooIE
36 from .extractor
.youtube
import YoutubeIE
, YoutubePlaylistIE
, YoutubeSearchIE
, YoutubeUserIE
, YoutubeChannelIE
37 from .extractor
.zdf
import ZDFIE
53 class YahooSearchIE(SearchInfoExtractor
):
54 """Information Extractor for Yahoo! Video search queries."""
57 IE_NAME
= u
'screen.yahoo:search'
58 _SEARCH_KEY
= 'yvsearch'
60 def _get_n_results(self
, query
, n
):
61 """Get a specified number of results for a query"""
68 for pagenum
in itertools
.count(0):
69 result_url
= u
'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse
.quote_plus(query
), pagenum
* 30)
70 webpage
= self
._download
_webpage
(result_url
, query
,
71 note
='Downloading results page '+str(pagenum
+1))
72 info
= json
.loads(webpage
)
74 results
= info
[u
'results']
76 for (i
, r
) in enumerate(results
):
77 if (pagenum
* 30) +i
>= n
:
79 mobj
= re
.search(r
'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r
)
80 e
= self
.url_result('http://' + mobj
.group('url'), 'Yahoo')
81 res
['entries'].append(e
)
82 if (pagenum
* 30 +i
>= n
) or (m
[u
'last'] >= (m
[u
'total'] -1 )):
88 class BlipTVUserIE(InfoExtractor
):
89 """Information Extractor for blip.tv users."""
91 _VALID_URL
= r
'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
93 IE_NAME
= u
'blip.tv:user'
95 def _real_extract(self
, url
):
97 mobj
= re
.match(self
._VALID
_URL
, url
)
99 raise ExtractorError(u
'Invalid URL: %s' % url
)
101 username
= mobj
.group(1)
103 page_base
= 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
105 page
= self
._download
_webpage
(url
, username
, u
'Downloading user page')
106 mobj
= re
.search(r
'data-users-id="([^"]+)"', page
)
107 page_base
= page_base
% mobj
.group(1)
110 # Download video ids using BlipTV Ajax calls. Result size per
111 # query is limited (currently to 12 videos) so we need to query
112 # page by page until there are no video ids - it means we got
119 url
= page_base
+ "&page=" + str(pagenum
)
120 page
= self
._download
_webpage
(url
, username
,
121 u
'Downloading video ids from page %d' % pagenum
)
123 # Extract video identifiers
126 for mobj
in re
.finditer(r
'href="/([^"]+)"', page
):
127 if mobj
.group(1) not in ids_in_page
:
128 ids_in_page
.append(unescapeHTML(mobj
.group(1)))
130 video_ids
.extend(ids_in_page
)
132 # A little optimization - if current page is not
133 # "full", ie. does not contain PAGE_SIZE video ids then
134 # we can assume that this page is the last one - there
135 # are no more ids on further pages - no need to query
138 if len(ids_in_page
) < self
._PAGE
_SIZE
:
143 urls
= [u
'http://blip.tv/%s' % video_id
for video_id
in video_ids
]
144 url_entries
= [self
.url_result(url
, 'BlipTV') for url
in urls
]
145 return [self
.playlist_result(url_entries
, playlist_title
= username
)]
148 class DepositFilesIE(InfoExtractor
):
149 """Information extractor for depositfiles.com"""
151 _VALID_URL
= r
'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
153 def _real_extract(self
, url
):
154 file_id
= url
.split('/')[-1]
155 # Rebuild url in english locale
156 url
= 'http://depositfiles.com/en/files/' + file_id
158 # Retrieve file webpage with 'Free download' button pressed
159 free_download_indication
= { 'gateway_result' : '1' }
160 request
= compat_urllib_request
.Request(url
, compat_urllib_parse
.urlencode(free_download_indication
))
162 self
.report_download_webpage(file_id
)
163 webpage
= compat_urllib_request
.urlopen(request
).read()
164 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
165 raise ExtractorError(u
'Unable to retrieve file webpage: %s' % compat_str(err
))
167 # Search for the real file URL
168 mobj
= re
.search(r
'<form action="(http://fileshare.+?)"', webpage
)
169 if (mobj
is None) or (mobj
.group(1) is None):
170 # Try to figure out reason of the error.
171 mobj
= re
.search(r
'<strong>(Attention.*?)</strong>', webpage
, re
.DOTALL
)
172 if (mobj
is not None) and (mobj
.group(1) is not None):
173 restriction_message
= re
.sub('\s+', ' ', mobj
.group(1)).strip()
174 raise ExtractorError(u
'%s' % restriction_message
)
176 raise ExtractorError(u
'Unable to extract download URL from: %s' % url
)
178 file_url
= mobj
.group(1)
179 file_extension
= os
.path
.splitext(file_url
)[1][1:]
181 # Search for file title
182 file_title
= self
._search
_regex
(r
'<b title="(.*?)">', webpage
, u
'title')
185 'id': file_id
.decode('utf-8'),
186 'url': file_url
.decode('utf-8'),
190 'ext': file_extension
.decode('utf-8'),
194 class FacebookIE(InfoExtractor
):
195 """Information Extractor for Facebook"""
197 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
198 _LOGIN_URL
= 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
199 _NETRC_MACHINE
= 'facebook'
200 IE_NAME
= u
'facebook'
202 def report_login(self
):
203 """Report attempt to log in."""
204 self
.to_screen(u
'Logging in')
206 def _real_initialize(self
):
207 if self
._downloader
is None:
212 downloader_params
= self
._downloader
.params
214 # Attempt to use provided username and password or .netrc data
215 if downloader_params
.get('username', None) is not None:
216 useremail
= downloader_params
['username']
217 password
= downloader_params
['password']
218 elif downloader_params
.get('usenetrc', False):
220 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
225 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
226 except (IOError, netrc
.NetrcParseError
) as err
:
227 self
._downloader
.report_warning(u
'parsing .netrc: %s' % compat_str(err
))
230 if useremail
is None:
239 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, compat_urllib_parse
.urlencode(login_form
))
242 login_results
= compat_urllib_request
.urlopen(request
).read()
243 if re
.search(r
'<form(.*)name="login"(.*)</form>', login_results
) is not None:
244 self
._downloader
.report_warning(u
'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
246 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
247 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
250 def _real_extract(self
, url
):
251 mobj
= re
.match(self
._VALID
_URL
, url
)
253 raise ExtractorError(u
'Invalid URL: %s' % url
)
254 video_id
= mobj
.group('ID')
256 url
= 'https://www.facebook.com/video/video.php?v=%s' % video_id
257 webpage
= self
._download
_webpage
(url
, video_id
)
259 BEFORE
= '{swf.addParam(param[0], param[1]);});\n'
260 AFTER
= '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
261 m
= re
.search(re
.escape(BEFORE
) + '(.*?)' + re
.escape(AFTER
), webpage
)
263 raise ExtractorError(u
'Cannot parse data')
264 data
= dict(json
.loads(m
.group(1)))
265 params_raw
= compat_urllib_parse
.unquote(data
['params'])
266 params
= json
.loads(params_raw
)
267 video_data
= params
['video_data'][0]
268 video_url
= video_data
.get('hd_src')
270 video_url
= video_data
['sd_src']
272 raise ExtractorError(u
'Cannot find video URL')
273 video_duration
= int(video_data
['video_duration'])
274 thumbnail
= video_data
['thumbnail_src']
276 video_title
= self
._html
_search
_regex
('<h2 class="uiHeaderTitle">([^<]+)</h2>',
281 'title': video_title
,
284 'duration': video_duration
,
285 'thumbnail': thumbnail
,
290 class BlipTVIE(InfoExtractor
):
291 """Information extractor for blip.tv"""
293 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
294 _URL_EXT
= r
'^.*\.([a-z0-9]+)$'
297 def report_direct_download(self
, title
):
298 """Report information extraction."""
299 self
.to_screen(u
'%s: Direct download detected' % title
)
301 def _real_extract(self
, url
):
302 mobj
= re
.match(self
._VALID
_URL
, url
)
304 raise ExtractorError(u
'Invalid URL: %s' % url
)
306 # See https://github.com/rg3/youtube-dl/issues/857
307 api_mobj
= re
.match(r
'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url
)
308 if api_mobj
is not None:
309 url
= 'http://blip.tv/play/g_%s' % api_mobj
.group('video_id')
310 urlp
= compat_urllib_parse_urlparse(url
)
311 if urlp
.path
.startswith('/play/'):
312 request
= compat_urllib_request
.Request(url
)
313 response
= compat_urllib_request
.urlopen(request
)
314 redirecturl
= response
.geturl()
315 rurlp
= compat_urllib_parse_urlparse(redirecturl
)
316 file_id
= compat_parse_qs(rurlp
.fragment
)['file'][0].rpartition('/')[2]
317 url
= 'http://blip.tv/a/a-' + file_id
318 return self
._real
_extract
(url
)
325 json_url
= url
+ cchar
+ 'skin=json&version=2&no_wrap=1'
326 request
= compat_urllib_request
.Request(json_url
)
327 request
.add_header('User-Agent', 'iTunes/10.6.1')
328 self
.report_extraction(mobj
.group(1))
331 urlh
= compat_urllib_request
.urlopen(request
)
332 if urlh
.headers
.get('Content-Type', '').startswith('video/'): # Direct download
333 basename
= url
.split('/')[-1]
334 title
,ext
= os
.path
.splitext(basename
)
335 title
= title
.decode('UTF-8')
336 ext
= ext
.replace('.', '')
337 self
.report_direct_download(title
)
347 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
348 raise ExtractorError(u
'ERROR: unable to download video info webpage: %s' % compat_str(err
))
349 if info
is None: # Regular URL
351 json_code_bytes
= urlh
.read()
352 json_code
= json_code_bytes
.decode('utf-8')
353 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
354 raise ExtractorError(u
'Unable to read video info webpage: %s' % compat_str(err
))
357 json_data
= json
.loads(json_code
)
358 if 'Post' in json_data
:
359 data
= json_data
['Post']
363 upload_date
= datetime
.datetime
.strptime(data
['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
364 video_url
= data
['media']['url']
365 umobj
= re
.match(self
._URL
_EXT
, video_url
)
367 raise ValueError('Can not determine filename extension')
371 'id': data
['item_id'],
373 'uploader': data
['display_name'],
374 'upload_date': upload_date
,
375 'title': data
['title'],
377 'format': data
['media']['mimeType'],
378 'thumbnail': data
['thumbnailUrl'],
379 'description': data
['description'],
380 'player_url': data
['embedUrl'],
381 'user_agent': 'iTunes/10.6.1',
383 except (ValueError,KeyError) as err
:
384 raise ExtractorError(u
'Unable to parse video information: %s' % repr(err
))
389 class MyVideoIE(InfoExtractor
):
390 """Information Extractor for myvideo.de."""
392 _VALID_URL
= r
'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
395 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
396 # Released into the Public Domain by Tristan Fischer on 2013-05-19
397 # https://github.com/rg3/youtube-dl/pull/842
398 def __rc4crypt(self
,data
, key
):
400 box
= list(range(256))
401 for i
in list(range(256)):
402 x
= (x
+ box
[i
] + compat_ord(key
[i
% len(key
)])) % 256
403 box
[i
], box
[x
] = box
[x
], box
[i
]
409 y
= (y
+ box
[x
]) % 256
410 box
[x
], box
[y
] = box
[y
], box
[x
]
411 out
+= chr(compat_ord(char
) ^ box
[(box
[x
] + box
[y
]) % 256])
415 return hashlib
.md5(s
).hexdigest().encode()
417 def _real_extract(self
,url
):
418 mobj
= re
.match(self
._VALID
_URL
, url
)
420 raise ExtractorError(u
'invalid URL: %s' % url
)
422 video_id
= mobj
.group(1)
425 b
'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
426 b
'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
427 b
'TnpsbA0KTVRkbU1tSTRNdz09'
431 webpage_url
= 'http://www.myvideo.de/watch/%s' % video_id
432 webpage
= self
._download
_webpage
(webpage_url
, video_id
)
434 mobj
= re
.search('source src=\'(.+?)[.]([^.]+)\'', webpage
)
436 self
.report_extraction(video_id
)
437 video_url
= mobj
.group(1) + '.flv'
439 video_title
= self
._html
_search
_regex
('<title>([^<]+)</title>',
442 video_ext
= self
._search
_regex
('[.](.+?)$', video_url
, u
'extension')
449 'title': video_title
,
454 mobj
= re
.search('var flashvars={(.+?)}', webpage
)
456 raise ExtractorError(u
'Unable to extract video')
461 for (a
, b
) in re
.findall('(.+?):\'(.+?)\',?', sec
):
462 if not a
== '_encxml':
465 encxml
= compat_urllib_parse
.unquote(b
)
466 if not params
.get('domain'):
467 params
['domain'] = 'www.myvideo.de'
468 xmldata_url
= '%s?%s' % (encxml
, compat_urllib_parse
.urlencode(params
))
469 if 'flash_playertype=MTV' in xmldata_url
:
470 self
._downloader
.report_warning(u
'avoiding MTV player')
472 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
473 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
477 enc_data
= self
._download
_webpage
(xmldata_url
, video_id
).split('=')[1]
478 enc_data_b
= binascii
.unhexlify(enc_data
)
480 base64
.b64decode(base64
.b64decode(GK
)) +
482 str(video_id
).encode('utf-8')
485 dec_data
= self
.__rc
4crypt
(enc_data_b
, sk
)
488 self
.report_extraction(video_id
)
491 mobj
= re
.search('connectionurl=\'(.*?)\'', dec_data
)
493 video_url
= compat_urllib_parse
.unquote(mobj
.group(1))
494 if 'myvideo2flash' in video_url
:
495 self
._downloader
.report_warning(u
'forcing RTMPT ...')
496 video_url
= video_url
.replace('rtmpe://', 'rtmpt://')
499 # extract non rtmp videos
500 mobj
= re
.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data
)
502 raise ExtractorError(u
'unable to extract url')
503 video_url
= compat_urllib_parse
.unquote(mobj
.group(1)) + compat_urllib_parse
.unquote(mobj
.group(2))
505 video_file
= self
._search
_regex
('source=\'(.*?)\'', dec_data
, u
'video file')
506 video_file
= compat_urllib_parse
.unquote(video_file
)
508 if not video_file
.endswith('f4m'):
509 ppath
, prefix
= video_file
.split('.')
510 video_playpath
= '%s:%s' % (prefix
, ppath
)
511 video_hls_playlist
= ''
514 video_hls_playlist
= (
515 video_filepath
+ video_file
516 ).replace('.f4m', '.m3u8')
518 video_swfobj
= self
._search
_regex
('swfobject.embedSWF\(\'(.+?)\'', webpage
, u
'swfobj')
519 video_swfobj
= compat_urllib_parse
.unquote(video_swfobj
)
521 video_title
= self
._html
_search
_regex
("<h1(?: class='globalHd')?>(.*?)</h1>",
530 'title': video_title
,
532 'play_path': video_playpath
,
533 'video_file': video_file
,
534 'video_hls_playlist': video_hls_playlist
,
535 'player_url': video_swfobj
,
539 class ComedyCentralIE(InfoExtractor
):
540 """Information extractor for The Daily Show and Colbert Report """
542 # urls can be abbreviations like :thedailyshow or :colbert
543 # urls for episodes like:
544 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
545 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
546 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
547 _VALID_URL
= r
"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
548 |(https?://)?(www\.)?
549 (?P<showname>thedailyshow|colbertnation)\.com/
550 (full-episodes/(?P<episode>.*)|
552 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
553 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
556 _available_formats
= ['3500', '2200', '1700', '1200', '750', '400']
558 _video_extensions
= {
566 _video_dimensions
= {
576 def suitable(cls
, url
):
577 """Receives a URL and returns True if suitable for this IE."""
578 return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None
580 def _print_formats(self
, formats
):
581 print('Available formats:')
583 print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'mp4'), self
._video
_dimensions
.get(x
, '???')))
586 def _real_extract(self
, url
):
587 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
589 raise ExtractorError(u
'Invalid URL: %s' % url
)
591 if mobj
.group('shortname'):
592 if mobj
.group('shortname') in ('tds', 'thedailyshow'):
593 url
= u
'http://www.thedailyshow.com/full-episodes/'
595 url
= u
'http://www.colbertnation.com/full-episodes/'
596 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
597 assert mobj
is not None
599 if mobj
.group('clip'):
600 if mobj
.group('showname') == 'thedailyshow':
601 epTitle
= mobj
.group('tdstitle')
603 epTitle
= mobj
.group('cntitle')
606 dlNewest
= not mobj
.group('episode')
608 epTitle
= mobj
.group('showname')
610 epTitle
= mobj
.group('episode')
612 self
.report_extraction(epTitle
)
613 webpage
,htmlHandle
= self
._download
_webpage
_handle
(url
, epTitle
)
615 url
= htmlHandle
.geturl()
616 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
618 raise ExtractorError(u
'Invalid redirected URL: ' + url
)
619 if mobj
.group('episode') == '':
620 raise ExtractorError(u
'Redirected URL is still not specific: ' + url
)
621 epTitle
= mobj
.group('episode')
623 mMovieParams
= re
.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage
)
625 if len(mMovieParams
) == 0:
626 # The Colbert Report embeds the information in a without
627 # a URL prefix; so extract the alternate reference
628 # and then add the URL prefix manually.
630 altMovieParams
= re
.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage
)
631 if len(altMovieParams
) == 0:
632 raise ExtractorError(u
'unable to find Flash URL in webpage ' + url
)
634 mMovieParams
= [("http://media.mtvnservices.com/" + altMovieParams
[0], altMovieParams
[0])]
636 uri
= mMovieParams
[0][1]
637 indexUrl
= 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse
.urlencode({'uri': uri}
)
638 indexXml
= self
._download
_webpage
(indexUrl
, epTitle
,
639 u
'Downloading show index',
640 u
'unable to download episode index')
644 idoc
= xml
.etree
.ElementTree
.fromstring(indexXml
)
645 itemEls
= idoc
.findall('.//item')
646 for partNum
,itemEl
in enumerate(itemEls
):
647 mediaId
= itemEl
.findall('./guid')[0].text
648 shortMediaId
= mediaId
.split(':')[-1]
649 showId
= mediaId
.split(':')[-2].replace('.com', '')
650 officialTitle
= itemEl
.findall('./title')[0].text
651 officialDate
= unified_strdate(itemEl
.findall('./pubDate')[0].text
)
653 configUrl
= ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
654 compat_urllib_parse
.urlencode({'uri': mediaId}
))
655 configXml
= self
._download
_webpage
(configUrl
, epTitle
,
656 u
'Downloading configuration for %s' % shortMediaId
)
658 cdoc
= xml
.etree
.ElementTree
.fromstring(configXml
)
660 for rendition
in cdoc
.findall('.//rendition'):
661 finfo
= (rendition
.attrib
['bitrate'], rendition
.findall('./src')[0].text
)
665 self
._downloader
.report_error(u
'unable to download ' + mediaId
+ ': No videos found')
668 if self
._downloader
.params
.get('listformats', None):
669 self
._print
_formats
([i
[0] for i
in turls
])
672 # For now, just pick the highest bitrate
673 format
,rtmp_video_url
= turls
[-1]
675 # Get the format arg from the arg stream
676 req_format
= self
._downloader
.params
.get('format', None)
678 # Select format if we can find one
681 format
, rtmp_video_url
= f
, v
684 m
= re
.match(r
'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url
)
686 raise ExtractorError(u
'Cannot transform RTMP url')
687 base
= 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
688 video_url
= base
+ m
.group('finalid')
690 effTitle
= showId
+ u
'-' + epTitle
+ u
' part ' + compat_str(partNum
+1)
695 'upload_date': officialDate
,
700 'description': officialTitle
,
707 class EscapistIE(InfoExtractor
):
708 """Information extractor for The Escapist """
710 _VALID_URL
= r
'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
711 IE_NAME
= u
'escapist'
713 def _real_extract(self
, url
):
714 mobj
= re
.match(self
._VALID
_URL
, url
)
716 raise ExtractorError(u
'Invalid URL: %s' % url
)
717 showName
= mobj
.group('showname')
718 videoId
= mobj
.group('episode')
720 self
.report_extraction(videoId
)
721 webpage
= self
._download
_webpage
(url
, videoId
)
723 videoDesc
= self
._html
_search
_regex
('<meta name="description" content="([^"]*)"',
724 webpage
, u
'description', fatal
=False)
726 imgUrl
= self
._html
_search
_regex
('<meta property="og:image" content="([^"]*)"',
727 webpage
, u
'thumbnail', fatal
=False)
729 playerUrl
= self
._html
_search
_regex
('<meta property="og:video" content="([^"]*)"',
730 webpage
, u
'player url')
732 title
= self
._html
_search
_regex
('<meta name="title" content="([^"]*)"',
733 webpage
, u
'player url').split(' : ')[-1]
735 configUrl
= self
._search
_regex
('config=(.*)$', playerUrl
, u
'config url')
736 configUrl
= compat_urllib_parse
.unquote(configUrl
)
738 configJSON
= self
._download
_webpage
(configUrl
, videoId
,
739 u
'Downloading configuration',
740 u
'unable to download configuration')
742 # Technically, it's JavaScript, not JSON
743 configJSON
= configJSON
.replace("'", '"')
746 config
= json
.loads(configJSON
)
747 except (ValueError,) as err
:
748 raise ExtractorError(u
'Invalid JSON in configuration file: ' + compat_str(err
))
750 playlist
= config
['playlist']
751 videoUrl
= playlist
[1]['url']
756 'uploader': showName
,
761 'description': videoDesc
,
762 'player_url': playerUrl
,
767 class CollegeHumorIE(InfoExtractor
):
768 """Information extractor for collegehumor.com"""
771 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
772 IE_NAME
= u
'collegehumor'
def report_manifest(self, video_id):
    """Tell the user the XML manifest for *video_id* is being fetched."""
    status = u'%s: Downloading XML manifest' % video_id
    self.to_screen(status)
778 def _real_extract(self
, url
):
779 mobj
= re
.match(self
._VALID
_URL
, url
)
781 raise ExtractorError(u
'Invalid URL: %s' % url
)
782 video_id
= mobj
.group('videoid')
790 self
.report_extraction(video_id
)
791 xmlUrl
= 'http://www.collegehumor.com/moogaloop/video/' + video_id
793 metaXml
= compat_urllib_request
.urlopen(xmlUrl
).read()
794 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
795 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
797 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
799 videoNode
= mdoc
.findall('./video')[0]
800 info
['description'] = videoNode
.findall('./description')[0].text
801 info
['title'] = videoNode
.findall('./caption')[0].text
802 info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
803 manifest_url
= videoNode
.findall('./file')[0].text
805 raise ExtractorError(u
'Invalid metadata XML file')
807 manifest_url
+= '?hdcore=2.10.3'
808 self
.report_manifest(video_id
)
810 manifestXml
= compat_urllib_request
.urlopen(manifest_url
).read()
811 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
812 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
814 adoc
= xml
.etree
.ElementTree
.fromstring(manifestXml
)
816 media_node
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
817 node_id
= media_node
.attrib
['url']
818 video_id
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
819 except IndexError as err
:
820 raise ExtractorError(u
'Invalid manifest file')
822 url_pr
= compat_urllib_parse_urlparse(manifest_url
)
823 url
= url_pr
.scheme
+ '://' + url_pr
.netloc
+ '/z' + video_id
[:-2] + '/' + node_id
+ 'Seg1-Frag1'
830 class XVideosIE(InfoExtractor
):
831 """Information extractor for xvideos.com"""
833 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
836 def _real_extract(self
, url
):
837 mobj
= re
.match(self
._VALID
_URL
, url
)
839 raise ExtractorError(u
'Invalid URL: %s' % url
)
840 video_id
= mobj
.group(1)
842 webpage
= self
._download
_webpage
(url
, video_id
)
844 self
.report_extraction(video_id
)
847 video_url
= compat_urllib_parse
.unquote(self
._search
_regex
(r
'flv_url=(.+?)&',
848 webpage
, u
'video URL'))
851 video_title
= self
._html
_search
_regex
(r
'<title>(.*?)\s+-\s+XVID',
854 # Extract video thumbnail
855 video_thumbnail
= self
._search
_regex
(r
'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
856 webpage
, u
'thumbnail', fatal
=False)
863 'title': video_title
,
865 'thumbnail': video_thumbnail
,
872 class SoundcloudIE(InfoExtractor
):
873 """Information extractor for soundcloud.com
874 To access the media, the uid of the song and a stream token
875 must be extracted from the page source and the script must make
876 a request to media.soundcloud.com/crossdomain.xml. Then
877 the media can be grabbed by requesting from an url composed
878 of the stream token and uid
881 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
882 IE_NAME
= u
'soundcloud'
def report_resolve(self, video_id):
    """Announce that *video_id* is being resolved through the API."""
    line = u'%s: Resolving id' % video_id
    self.to_screen(line)
888 def _real_extract(self
, url
):
889 mobj
= re
.match(self
._VALID
_URL
, url
)
891 raise ExtractorError(u
'Invalid URL: %s' % url
)
893 # extract uploader (which is in the url)
894 uploader
= mobj
.group(1)
895 # extract simple title (uploader + slug of song title)
896 slug_title
= mobj
.group(2)
897 simple_title
= uploader
+ u
'-' + slug_title
898 full_title
= '%s/%s' % (uploader
, slug_title
)
900 self
.report_resolve(full_title
)
902 url
= 'http://soundcloud.com/%s/%s' % (uploader
, slug_title
)
903 resolv_url
= 'http://api.soundcloud.com/resolve.json?url=' + url
+ '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
904 info_json
= self
._download
_webpage
(resolv_url
, full_title
, u
'Downloading info JSON')
906 info
= json
.loads(info_json
)
907 video_id
= info
['id']
908 self
.report_extraction(full_title
)
910 streams_url
= 'https://api.sndcdn.com/i1/tracks/' + str(video_id
) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
911 stream_json
= self
._download
_webpage
(streams_url
, full_title
,
912 u
'Downloading stream definitions',
913 u
'unable to download stream definitions')
915 streams
= json
.loads(stream_json
)
916 mediaURL
= streams
['http_mp3_128_url']
917 upload_date
= unified_strdate(info
['created_at'])
922 'uploader': info
['user']['username'],
923 'upload_date': upload_date
,
924 'title': info
['title'],
926 'description': info
['description'],
929 class SoundcloudSetIE(InfoExtractor
):
930 """Information extractor for soundcloud.com sets
931 To access the media, the uid of the song and a stream token
932 must be extracted from the page source and the script must make
933 a request to media.soundcloud.com/crossdomain.xml. Then
934 the media can be grabbed by requesting from an url composed
935 of the stream token and uid
938 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
939 IE_NAME
= u
'soundcloud:set'
def report_resolve(self, video_id):
    """Report that the set identified by *video_id* is being resolved."""
    # Same message format as SoundcloudIE.report_resolve.
    self.to_screen(u'%s: Resolving id' % video_id)
945 def _real_extract(self
, url
):
946 mobj
= re
.match(self
._VALID
_URL
, url
)
948 raise ExtractorError(u
'Invalid URL: %s' % url
)
950 # extract uploader (which is in the url)
951 uploader
= mobj
.group(1)
952 # extract simple title (uploader + slug of song title)
953 slug_title
= mobj
.group(2)
954 simple_title
= uploader
+ u
'-' + slug_title
955 full_title
= '%s/sets/%s' % (uploader
, slug_title
)
957 self
.report_resolve(full_title
)
959 url
= 'http://soundcloud.com/%s/sets/%s' % (uploader
, slug_title
)
960 resolv_url
= 'http://api.soundcloud.com/resolve.json?url=' + url
+ '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
961 info_json
= self
._download
_webpage
(resolv_url
, full_title
)
964 info
= json
.loads(info_json
)
966 for err
in info
['errors']:
967 self
._downloader
.report_error(u
'unable to download video webpage: %s' % compat_str(err
['error_message']))
970 self
.report_extraction(full_title
)
971 for track
in info
['tracks']:
972 video_id
= track
['id']
974 streams_url
= 'https://api.sndcdn.com/i1/tracks/' + str(video_id
) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
975 stream_json
= self
._download
_webpage
(streams_url
, video_id
, u
'Downloading track info JSON')
977 self
.report_extraction(video_id
)
978 streams
= json
.loads(stream_json
)
979 mediaURL
= streams
['http_mp3_128_url']
984 'uploader': track
['user']['username'],
985 'upload_date': unified_strdate(track
['created_at']),
986 'title': track
['title'],
988 'description': track
['description'],
993 class InfoQIE(InfoExtractor
):
994 """Information extractor for infoq.com"""
995 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
997 def _real_extract(self
, url
):
998 mobj
= re
.match(self
._VALID
_URL
, url
)
1000 raise ExtractorError(u
'Invalid URL: %s' % url
)
1002 webpage
= self
._download
_webpage
(url
, video_id
=url
)
1003 self
.report_extraction(url
)
1006 mobj
= re
.search(r
"jsclassref ?= ?'([^']*)'", webpage
)
1008 raise ExtractorError(u
'Unable to extract video url')
1009 real_id
= compat_urllib_parse
.unquote(base64
.b64decode(mobj
.group(1).encode('ascii')).decode('utf-8'))
1010 video_url
= 'rtmpe://video.infoq.com/cfx/st/' + real_id
1013 video_title
= self
._search
_regex
(r
'contentTitle = "(.*?)";',
1016 # Extract description
1017 video_description
= self
._html
_search
_regex
(r
'<meta name="description" content="(.*)"(?:\s*/)?>',
1018 webpage
, u
'description', fatal
=False)
1020 video_filename
= video_url
.split('/')[-1]
1021 video_id
, extension
= video_filename
.split('.')
1027 'upload_date': None,
1028 'title': video_title
,
1029 'ext': extension
, # Extension is always(?) mp4, but seems to be flv
1031 'description': video_description
,
1036 class MixcloudIE(InfoExtractor
):
1037 """Information extractor for www.mixcloud.com"""
1039 _WORKING
= False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1040 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1041 IE_NAME
= u
'mixcloud'
def report_download_json(self, file_id):
    """Announce the metadata JSON download.

    Note: *file_id* is accepted for interface symmetry with the other
    report_* helpers but is not included in the message.
    """
    self.to_screen(u'Downloading json')
1047 def get_urls(self
, jsonData
, fmt
, bitrate
='best'):
1048 """Get urls from 'audio_formats' section in json"""
1051 bitrate_list
= jsonData
[fmt
]
1052 if bitrate
is None or bitrate
== 'best' or bitrate
not in bitrate_list
:
1053 bitrate
= max(bitrate_list
) # select highest
1055 url_list
= jsonData
[fmt
][bitrate
]
1056 except TypeError: # we have no bitrate info.
1057 url_list
= jsonData
[fmt
]
1060 def check_urls(self
, url_list
):
1061 """Returns 1st active url from list"""
1062 for url
in url_list
:
1064 compat_urllib_request
.urlopen(url
)
1066 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1071 def _print_formats(self
, formats
):
1072 print('Available formats:')
1073 for fmt
in formats
.keys():
1074 for b
in formats
[fmt
]:
1076 ext
= formats
[fmt
][b
][0]
1077 print('%s\t%s\t[%s]' % (fmt
, b
, ext
.split('.')[-1]))
1078 except TypeError: # we have no bitrate info
1079 ext
= formats
[fmt
][0]
1080 print('%s\t%s\t[%s]' % (fmt
, '??', ext
.split('.')[-1]))
1083 def _real_extract(self
, url
):
1084 mobj
= re
.match(self
._VALID
_URL
, url
)
1086 raise ExtractorError(u
'Invalid URL: %s' % url
)
1087 # extract uploader & filename from url
1088 uploader
= mobj
.group(1).decode('utf-8')
1089 file_id
= uploader
+ "-" + mobj
.group(2).decode('utf-8')
1091 # construct API request
1092 file_url
= 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url
.split('/')[-3:-1]) + '.json'
1093 # retrieve .json file with links to files
1094 request
= compat_urllib_request
.Request(file_url
)
1096 self
.report_download_json(file_url
)
1097 jsonData
= compat_urllib_request
.urlopen(request
).read()
1098 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1099 raise ExtractorError(u
'Unable to retrieve file: %s' % compat_str(err
))
1102 json_data
= json
.loads(jsonData
)
1103 player_url
= json_data
['player_swf_url']
1104 formats
= dict(json_data
['audio_formats'])
1106 req_format
= self
._downloader
.params
.get('format', None)
1109 if self
._downloader
.params
.get('listformats', None):
1110 self
._print
_formats
(formats
)
1113 if req_format
is None or req_format
== 'best':
1114 for format_param
in formats
.keys():
1115 url_list
= self
.get_urls(formats
, format_param
)
1117 file_url
= self
.check_urls(url_list
)
1118 if file_url
is not None:
1121 if req_format
not in formats
:
1122 raise ExtractorError(u
'Format is not available')
1124 url_list
= self
.get_urls(formats
, req_format
)
1125 file_url
= self
.check_urls(url_list
)
1126 format_param
= req_format
1129 'id': file_id
.decode('utf-8'),
1130 'url': file_url
.decode('utf-8'),
1131 'uploader': uploader
.decode('utf-8'),
1132 'upload_date': None,
1133 'title': json_data
['name'],
1134 'ext': file_url
.split('.')[-1].decode('utf-8'),
1135 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
1136 'thumbnail': json_data
['thumbnail_url'],
1137 'description': json_data
['description'],
1138 'player_url': player_url
.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Each linked VideoPage is re-dispatched through self.extract.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Recurse into every linked course page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # "mtv_vt" carries the song name; only used as a best-effort log.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # "mtv_an" is the artist name; reuse it as the uploader below.
        # FIX: the original referenced an undefined name `performer` in the
        # info dict, which raised NameError at runtime.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # NOTE(review): mtvn_uri/content_id are extracted with fatal=False, so
        # either may be None here and the concatenation would raise TypeError
        # — kept as-is; confirm whether those pages ever lack the meta tags.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp + two random runs."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the alphabet deterministically from `seed` (Youku's LCG scheme)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear congruential step; each round picks (and removes) one char.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        # track info
        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv url is percent-encoded inside the page source.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN url can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: the archive API is paginated.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player headline first, then fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video/app pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The verbose pattern needs re.VERBOSE, hence the override.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Some pages are age-gated; re-request through the agecheck URL.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order; zip pairs them up.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv lives on a predictable CDN path keyed by the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': video_title,
                'uploader': uploader,
                'thumbnail': thumbnail,
               }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy)."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Container is inferred from the url itself.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : video_title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment on the page.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': data.get('host', {}).get('name'),
                'uploader_id': data.get('host', {}).get('slug'),
                'thumbnail': data.get('image', {}).get('large_url_2x'),
                'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age_verified cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            # FIX: was a bare `except:` that swallowed every exception type.
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # FIX: sys.exc_info()[1] is an exception object; concatenating it
            # to a str raised TypeError instead of the intended message.
            raise ExtractorError('Missing JSON parameter: ' + str(sys.exc_info()[1]))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            # resolution and bitrate, e.g. "480p-370k"
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # FIX: the guard tested an undefined name `result` (NameError);
            # it must check the value returned by _specific.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player variables.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # Each playback session walks the mix track-by-track through the API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Both the video and its thumbnail live on predictable CDN paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information Extractor for ted.com talks and playlists.

    NOTE(review): reconstructed from a garbled extract; the alternation in
    _VALID_URL, the @classmethod decorator, the playlist `else` branch and
    the return dictionaries were elided and restored from the visible
    structure — verify against upstream.
    """
    # Verbose regex: a URL is either a playlist or a single talk,
    # optionally carrying a language path segment.
    _VALID_URL = r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so the default matcher
        # (which omits re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Pair each talk id with its matching title/url entry; every talk
        # is delegated to the TED extractor itself via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags=re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # Renamed from the original's re-use of `info` to avoid shadowing
        # the parsed JSON while it is still being read.
        talk_info = {
            'id': video_id,
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return talk_info
class MySpassIE(InfoExtractor):
    """Information Extractor for myspass.de.

    NOTE(review): reconstructed from a garbled extract; the elided fallback
    branches (empty video id, missing format/description/thumbnail elements)
    were restored — verify defaults against upstream.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # assumed default when the element is absent — TODO confirm
            video_format = 'mp4'
        else:
            # renamed from `format`, which shadows the builtin
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information Extractor for spiegel.de videos.

    NOTE(review): reconstructed from a garbled extract; the elided result
    dictionary was restored — verify against upstream.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Per-video XML manifest lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry is picked; presumably the highest quality —
        # TODO confirm against the manifest layout.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information Extractor for liveleak.com.

    NOTE(review): reconstructed from a garbled extract; the elided URL guard
    and result dictionary were restored — verify against upstream.
    """
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that is stripped off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }
        return [info]
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr-hosted videos.

    NOTE(review): reconstructed from a garbled extract; the elided match
    guard and return keys were restored — verify against upstream.
    """
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize /video/ URLs to the canonical /post/ form.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped in the page (\x22 == double quote).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks.

    NOTE(review): reconstructed from a garbled extract; the elided return
    keys were restored — verify against upstream. The original's local
    `id` (which shadows the builtin) was renamed to `track_id`.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    # NOTE(review): reconstructed from a garbled extract; the elided URL
    # guard and result list were restored — verify against upstream.
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    # NOTE(review): reconstructed from a garbled extract; the elided result
    # list was restored — verify against upstream.
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Metadata (including the mp4 URL) lives in a per-video MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    # NOTE(review): reconstructed from a garbled extract; the elided result
    # list opening was restored — verify against upstream.
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # og:title may use either quote style around the content attribute.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    # NOTE(review): reconstructed from a garbled extract; the elided result
    # list opening was restored — verify against upstream.
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Drop any query string that follows the thumbnail URL.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    # NOTE(review): reconstructed from a garbled extract; the elided stream
    # guard and result list opening were restored — verify against upstream.
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Two round-trips: first XML gives the playlist node id, second XML
        # gives the actual RTMP/stream location.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos.

    NOTE(review): reconstructed from a garbled extract; the elided URL guard
    and result list opening were restored — verify against upstream.
    """
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id needed by the data endpoint is only in the page.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # NOTE(review): reconstructed from a garbled extract; the elided media
    # guard and the if/else around the upload date were restored — verify
    # against upstream.
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' already holds a (quoted) absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    # NOTE(review): reconstructed from a garbled extract; the elided
    # try/except blocks, the `key` extraction and the result list were
    # restored from the visible structure — verify against upstream.
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The 'ax'/'ts' query parameters are required by the site.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie must be replayed on the serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': 'mp3',
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    # NOTE(review): reconstructed from a garbled extract; the elided URL
    # guard and result list were restored — verify against upstream.
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play URL answers with a JS redirect that must be followed.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The media URL is resolved via a form-encoded POST to the player API.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "url=...&thumb=..." — keep only the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
# NOTE(review): this span is garbled by the extraction (original line
# numbers fused into the text) and most of the extractor list has been
# elided — only three entries of a much longer list are visible here.
# Restore this function from the upstream file before editing; kept
# byte-identical below to avoid guessing at the missing entries.
2747 def gen_extractors():
2748 """ Return a list of an instance of every supported extractor.
2749 The order does matter; the first extractor matched is the one handling the URL.
2752 YoutubePlaylistIE(),
2777 StanfordOpenClassroomIE(),
2787 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the info-extractor class registered in this module.

    The class is expected to be named ``<ie_name>IE``; a missing name
    propagates as a KeyError.
    """
    return globals()['%sIE' % ie_name]