]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
FFmpegPostProcessor: decode stderr first and then get the last line (closes #837)
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
9e8056d5
PH
4from __future__ import absolute_import
5
4fcca4bb 6import base64
d77c3dfd 7import datetime
ccf65f9d 8import itertools
d77c3dfd
FV
9import netrc
10import os
11import re
12import socket
13import time
d77c3dfd 14import email.utils
921a1455 15import xml.etree.ElementTree
302efc19 16import random
17import math
6324fd1d 18import operator
d77c3dfd 19
9e8056d5 20from .utils import *
d77c3dfd
FV
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() runs at most once per instance; later calls are
        no-ops because _ready is set after the first successful run.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Extractor name: the class name with its last two characters dropped."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note:    message shown before the request; None prints the default
                 "Downloading webpage" message, False prints nothing.
        errnote: prefix of the ExtractorError message raised on network
                 failure (defaults to 'Unable to download webpage').
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Return a tuple (page content as string, URL handle).

        The body is decoded with the charset announced in the Content-Type
        header; UTF-8 is used when no charset is announced or when the
        announced one is not a codec Python knows. Undecodable bytes are
        replaced rather than raising.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # The charset value may be quoted and may be followed by further
        # parameters, so capture only the bare token.
        m = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset="?([^\s;"]+)',
            content_type)
        encoding = m.group(1) if m else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a Request object
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server announced a charset that is not a known codec
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string."""
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Mark video_info (in place) as a video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist; id and title are only included when truthy."""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
64ce2aad 190
d830b7c2 191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: group 1 is the optional URL prefix, group 2 the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Formats missing from this table are treated as 'flv' by the lookups below
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; leave them to the playlist IE
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the subtitles for one language."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle track list for video_id.

        Returns a dict mapping language code -> track name on success, or a
        tuple (error_message, None) when the list cannot be downloaded or
        is empty. Callers must check for the tuple form.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((lang_code, name) for name, lang_code in tracks)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        if isinstance(sub_lang_list, tuple):
            # Error tuple (message, None): there is no language dict to list
            self._downloader.report_error(sub_lang_list[0])
            return
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option, then English, then any track
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per available language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each format code with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in and confirm age.

        Most failures are reported as warnings and abort initialization
        early; only a network failure during age confirmation raises
        ExtractorError.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form fields that must be echoed back with the credentials
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode. Fields whose value could not be determined
        # (None, e.g. GALX/dsh missing from the login page) are left out
        # instead of crashing on None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items() if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST bodies must be bytes on Python 3 (same as login_data above)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL); raise ExtractorError if url does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the information dictionaries for a YouTube video URL.

        Returns a list with one dictionary per selected format, or None
        when only a listing was requested ('listsubtitles' / 'listformats').
        Raises ExtractorError when the page, the video info or a usable
        format cannot be obtained.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Drop the backslash escapes (e.g. "\/" -> "/")
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each "el" variant until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image (don't panic if we can't find it)
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # Listing only: print the languages and extract nothing
            self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    fmt_url = url_data['url'][0]
                    # Not every entry is guaranteed to carry a signature;
                    # append it only when present instead of raising KeyError
                    if 'sig' in url_data:
                        fmt_url += '&signature=' + url_data['sig'][0]
                    if 'ratebypass' not in fmt_url:
                        fmt_url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = fmt_url

            format_limit = self._downloader.params.get('format_limit', None)
            if self._downloader.params.get('prefer_free_formats', False):
                available_formats = self._available_formats_prefer_free
            else:
                available_formats = self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Drop every format of higher quality than the limit
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # format_param is None for RTMP downloads; show the extension instead
            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
d77c3dfd
FV
677
678
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    @staticmethod
    def _to_text(value):
        """Return value as text: byte strings are decoded as UTF-8, text is returned unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and submit the age confirmation form.

        Raises ExtractorError if either request fails.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # POST bodies must be bytes on Python 3; urlencode output is pure ASCII
        form_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
        request = compat_urllib_request.Request(self._FILTER_POST, form_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the information dictionary for a metacafe.com watch URL.

        Returns a one-element list: a URL result pointing at YouTube when
        the video id has the form 'yt-<id>', otherwise the video's info
        dictionary. Raises ExtractorError when the URL is invalid or the
        media URL, title or uploader cannot be found in the page.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the JSON-like "mediaData" blob inside the flashvars
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # webpage is already decoded text, so the match needs no .decode()
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Values derived from the URL may still be byte strings on Python 2;
        # _to_text decodes those and leaves text untouched.
        return [{
            'id': self._to_text(video_id),
            'url': self._to_text(video_url),
            'uploader': self._to_text(video_uploader),
            'upload_date': None,
            'title': video_title,
            'ext': self._to_text(video_extension),
        }]
d77c3dfd 774
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The page is requested with the family filter cookie switched off and
        the best available quality found in the page's flashvars is used.
        Raises ExtractorError when the URL is invalid or when the media URL
        or the title cannot be extracted; a missing uploader only triggers a
        warning.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Keep only the id part: drop the title slug and any query string
        video_id = url_match.group(1).split('_')[0].split('?')[0]

        # Download the video page with the family filter disabled
        page_request = compat_urllib_request.Request(url)
        page_request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(page_request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Quality keys ordered from best to worst; the first one present wins
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        present_keys = [candidate for candidate in quality_keys if candidate in flashvars]
        if not present_keys:
            raise ExtractorError(u'Unable to extract video URL')
        max_quality = present_keys[0]
        self.to_screen(u'Using %s' % max_quality)

        stream_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video URL')
        unquoted_stream = compat_urllib_parse.unquote(stream_match.group(1))
        video_url = unquoted_stream.replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Regular owner markup first, then the markup used for official users
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows the date as DD-MM-YYYY; reorder it to YYYYMMDD
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is None:
            video_upload_date = None
        else:
            day, month, year = date_match.group(1), date_match.group(2), date_match.group(3)
            video_upload_date = year + month + day

        info = {
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      'mp4',
        }
        return [info]
d77c3dfd
FV
849
850
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The media description embedded in the page's javascript is preferred;
        if it is absent, the "video_src" link and the page title are used.
        Raises ExtractorError when the URL is invalid or when the fallback
        cannot find the media URL or the title.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        # As in the javascript branch above, the values are returned as
        # matched: calling .decode('utf-8') on text fails on Python 3 (no
        # str.decode) and on non-ASCII unicode strings in Python 2.
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
d77c3dfd
FV
912
913
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Return the info dictionary for a screen.yahoo.com video.

        Two methods are used depending on whether the page defines a separate
        CONTENT_ID: the "cosmos" REST feeds (no CONTENT_ID) or the YQL JSONP
        service (CONTENT_ID present).

        Note that, unlike most extractors in this file, the dictionary is
        returned directly and not wrapped in a list.
        Raises ExtractorError when the URL is invalid or when the video
        information cannot be found in the downloaded documents.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Verbose regex: unescaped whitespace and newlines are ignored
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed gives MM/DD/YYYY; convert to YYYYMMDD
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # The match has to be checked before its groups are read,
            # otherwise a failed search raises AttributeError instead of
            # the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get the JSON payload
            m_json = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage)
            if m_json is None:
                raise ExtractorError(u'Unable to extract video info')
            info = json.loads(m_json.group(1))
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
d77c3dfd
FV
981
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dictionary of the video.

        new_video is accepted for compatibility and is not used here.
        Raises ExtractorError when the URL is invalid, when the embedded
        player configuration cannot be read from the page, or when none of
        the known codecs is offered.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Direct stream links carry no page; use the canonical video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is not in the page.
            # ValueError: the extracted text is not valid JSON.
            # A bare except would also swallow KeyboardInterrupt/SystemExit.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first codec of the best quality bucket that is not empty
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
d77c3dfd
FV
1083
1084
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the response body as read from the socket.

        Raises ExtractorError on network errors or when url is not a valid
        URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the matched groups.

        matchTuples is a sequence of (group index, key, error message)
        triples. Returns a dictionary mapping every key to the text of its
        group.
        Raises ExtractorError if the regex does not match at all, or with the
        triple's own message if one of the groups did not participate.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the stream location of a live page.

        NOTE(review): the computed stream URL is neither returned nor stored,
        so live pages currently produce no download information.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # All fragments are raw strings so that "\." is not an (invalid)
        # string escape; the resulting pattern text is unchanged.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Return the info dictionary of an "Arte+7" (catch-up) video.

        Three documents are followed in turn: the video page, the reference
        file listing one <video> per language, and the per-language
        description holding id, title, date and the HD stream URL.
        Raises ExtractorError (through grep_webpage) when a step fails.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): decode() matches the undecoded body returned by
            # fetch_webpage; confirm before changing either side.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Return a one-element list with the video info, or None for live pages."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
f2ad10a9
CA
1204
1205
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # The fallback warning is suppressed when the 'test' param is set
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        A HEAD request is sent (falling back to GET on a 405 response).
        Returns False when the final URL equals url, the final URL otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers make no sense on the new request
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Return a one-element list describing the video found at url.

        If the URL redirects elsewhere, a url_result for the target is
        returned so that the proper extractor can handle it. Otherwise the
        page is searched for a JW Player style "file"/"source" media URL.
        Raises ExtractorError when no media URL, title or uploader is found.
        """
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # This failure concerns the uploader, not the title
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
d77c3dfd
FV
1337
1338
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Run a "ytsearch[N|all]:<terms>" query and return the results.

        Without a number a single result is fetched; "all" fetches up to
        _max_youtube_results. Returns the list built by _get_n_results.
        Raises ExtractorError for a malformed query or a non-positive count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms may contain ':'
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)
            if n <= 0:
                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages of 50 results are requested until n ids were collected or the
        API reports no further items. Returns a list of url_results for the
        Youtube extractor.
        Raises ExtractorError on download errors or when a response has no
        'items'.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube') for video_id in video_ids]
        return videos
d77c3dfd
FV
1405
1406
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Run a "gvsearch[N|all]:<terms>" query and return a playlist result.

        Without a number a single result is requested; "all" requests up to
        _max_google_results.
        Raises ExtractorError for a malformed query or a non-positive count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        else:
            # The regex guarantees that prefix consists of digits here
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose 'entries' are plain url results,
        at most n of them.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Pages are numbered from 0 so that the first request uses start=0;
        # starting at 10 would skip the first ten results.
        for pagenum in itertools.count():
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop once enough result slots were covered or when the page
            # has no "next" link.
            if ((pagenum + 1) * 10 >= n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # A page may hold more links than were still needed
                res['entries'] = res['entries'][:n]
                return res
d77c3dfd
FV
1456
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'

    _max_yahoo_results = 1000
    IE_NAME = u'screen.yahoo:search'

    def _real_extract(self, query):
        """Run a "yvsearch[N|all]:<terms>" query and return a playlist result.

        Without a number a single result is requested; "all" requests up to
        _max_yahoo_results.
        Raises ExtractorError for a malformed query or a non-positive count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms may contain ':'
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_yahoo_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)
            if n <= 0:
                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose 'entries' are url_results for the
        Yahoo extractor, at most n of them.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        entries = res['entries']
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Not a screen.yahoo.com video link; nothing to hand over
                    continue
                entries.append(self.url_result('http://' + mobj.group('url'), 'Yahoo'))
            # Counting collected entries (rather than reusing the inner loop
            # index) also works when a page comes back without any results.
            if len(entries) >= n or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
d77c3dfd
FV
1515
1516
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        url_match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return url_match is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result.

        The playlist feed is read page by page (_MAX_RESULTS entries each)
        and the videos are ordered by their position in the playlist.
        Raises ExtractorError for an invalid URL or an unusable API response.
        """
        id_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is in group 1 for full URLs and in group 2 for bare ids
        playlist_id = id_match.group(1) or id_match.group(2)
        positioned_videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            page_entries = feed['entry']
            for entry in page_entries:
                # Entries without 'content' carry no video source and are skipped
                if 'content' in entry:
                    positioned_videos.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page is the last one
            if len(page_entries) < self._MAX_RESULTS:
                break

        ordered_urls = [source for _position, source in sorted(positioned_videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
d77c3dfd
FV
1582
1583
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, first occurrence order, no duplicates."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in found:
                continue
            found.append(candidate)
        return found

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for the channel.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = id_match.group(1)

        # The first page is the regular HTML channel listing
        pagenum = 1
        first_page = self._download_webpage(
            self._TEMPLATE_URL % (channel_id, pagenum), channel_id,
            u'Downloading page #%s' % pagenum)
        video_ids = list(self.extract_videos_from_page(first_page))

        # Subsequent pages come from the json-based channel_ajax query;
        # keep going while the "load more" widget is still advertised.
        more_pages = self._MORE_PAGES_INDICATOR in first_page
        while more_pages:
            pagenum += 1
            raw_page = self._download_webpage(
                self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                u'Downloading page #%s' % pagenum)
            ajax_page = json.loads(raw_page)
            video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))
            more_pages = self._MORE_PAGES_INDICATOR in ajax_page['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
902b2a0a
FV
1641
1642
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of a user's uploads.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index requested on this page (inclusive); the API
            # start-index is 1-based, hence the "- 1".
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, end_index))

            # Extract video identifiers, dropping duplicates within the page
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
d77c3dfd
FV
1699
1700
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of a user's videos.

        Raises ExtractorError when url does not match _VALID_URL or when
        the user page does not expose a data-users-id attribute.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric users id needed by the episode list endpoint is
        # embedded in the user page.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            raise ExtractorError(u'Unable to extract blip.tv users id for %s' % username)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare the unescaped form, since that is what is
                # stored; otherwise escaped duplicates slip through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
eeeb4daa
JCGS
1759
1760
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the file.

        Raises ExtractorError when the page cannot be retrieved, when the
        site reports a restriction, or when the download URL or the title
        cannot be found in the page.
        """
        file_id = url.split('/')[-1]
        if isinstance(file_id, bytes):
            # Byte-string URL (Python 2): expose the id as text
            file_id = file_id.decode('utf-8')
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # POST data has to be bytes on Python 3.
        free_download_indication = { 'gateway_result' : '1' }
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
        # Decode once so that the text patterns below work on Python 3 too
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
d77c3dfd
FV
1808
1809
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with the configured credentials, if any.

        Credentials come from the downloader parameters 'username' and
        'password' or, when 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE. Every failure is reported as a warning and the
        method returns without raising.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data has to be bytes on Python 3
        post_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, post_data)
        try:
            self.report_login()
            # Decode the response so the text pattern below can be applied
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # Still seeing the login form means the login was rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when url does not match, when the player
        data cannot be located in the page, or when no video URL or title
        is found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables sit as a JSON literal between these two
        # JavaScript snippets.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself URL-quoted JSON
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
59ae15a5 1906
d77c3dfd
FV
1907
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points directly at a video file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when url does not match, when the info page
        cannot be downloaded or read, or when its JSON cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a player URL whose fragment carries
            # the file reference; rebuild the canonical URL and start over.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Only byte strings need decoding; text strings (always
                # the case on Python 3) have no decode() method.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
d77c3dfd
FV
2001
2002
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when url does not match or when the media
        URL or the title cannot be found in the watch page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # Get video webpage
        webpage = self._download_webpage('http://www.myvideo.de/watch/%s' % video_id, video_id)

        self.report_extraction(video_id)

        # The media base URL is taken from the thumbnail link of the page
        thumb_match = re.search(
            r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
            webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = thumb_match.group(1) + ('/%s.flv' % video_id)

        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        info = {
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    title_match.group(1),
            'ext':      u'flv',
        }
        return [info]
d77c3dfd
FV
2041
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepted URLs (verbose regex):
    #  * abbreviations such as :thedailyshow or :colbert
    #  * full episode URLs (.../full-episodes/...)
    #  * clip URLs such as
    #      http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #      http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #      http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _print_formats(self, formats):
        """Print one line per format id with its extension and dimensions."""
        print('Available formats:')
        for format_id in formats:
            extension = self._video_extensions.get(format_id, 'mp4')
            dimensions = self._video_dimensions.get(format_id, '???')
            print('%s\t:\t%s\t[%s]' %(format_id, extension, dimensions))


    def _real_extract(self, url):
        """Return a list of info dictionaries, one per part of the episode or clip.

        Returns None after printing the formats when the 'listformats'
        downloader parameter is set. Raises ExtractorError when the URL,
        the redirect target, the Flash URL or an RTMP URL cannot be handled.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand an abbreviation to the show's full-episodes page
        shortname = mobj.group('shortname')
        if shortname:
            if shortname in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            dlNewest = False
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
        else:
            episode = mobj.group('episode')
            # No episode in the URL means: follow the redirect to the newest one
            dlNewest = not episode
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = episode

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        movie_params = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if not movie_params:
            # Some pages only carry the media reference in a data-mgid
            # attribute, without the URL prefix; take that reference
            # and add the prefix manually.
            alt_params = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if not alt_params:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            movie_params = [("http://media.mtvnservices.com/" + alt_params[0], alt_params[0])]

        uri = movie_params[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        index_doc = xml.etree.ElementTree.fromstring(indexXml)
        for partNum, itemEl in enumerate(index_doc.findall('.//item')):
            mediaId = itemEl.findall('./guid')[0].text
            id_fields = mediaId.split(':')
            shortMediaId = id_fields[-1]
            showId = id_fields[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # (bitrate, source url) pairs in document order
            config_doc = xml.etree.ElementTree.fromstring(configXml)
            turls = [(rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                     for rendition in config_doc.findall('.//rendition')]

            if not turls:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([pair[0] for pair in turls])
                return

            # Default to the last rendition (for now, the highest bitrate),
            # unless the requested format is among the renditions.
            req_format = self._downloader.params.get('format', None)
            chosen = turls[-1]
            for candidate in turls:
                if candidate[0] == req_format:
                    chosen = candidate
                    break
            video_format, rtmp_video_url = chosen

            # Map the RTMP URL onto the HTTP mirror
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            results.append({
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            })

        return results
d77c3dfd
FV
2208
2209
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Description and thumbnail are optional and set to None when the
        page does not provide them. Raises ExtractorError when url does
        not match, when the player or configuration URL cannot be found,
        when the configuration is not valid JSON, or when it holds no
        video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Optional metadata: tolerate its absence
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # Required: the player URL carries the configuration URL
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract configuration URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The video URL is read from the second playlist item
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Unable to find video URL in configuration')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
d77c3dfd 2263
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report the download of the XML manifest."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when url does not match, when the metadata
        or manifest XML cannot be downloaded, or when either document
        lacks an expected node or attribute.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: node missing; KeyError: media node without 'url'
            raise ExtractorError(u'Invalid manifest file')

        # Build the URL of the first fragment from the manifest location,
        # the manifest id (minus its last two characters) and the media
        # node url.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
d77c3dfd
FV
2325
2326
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when url does not match or when the video
        URL, the title or the thumbnail cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Video URL: URL-quoted value of the flv_url parameter
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: page title up to the " - XVID..." suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # Thumbnail: the whole matched URL is used, not just the file name
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
d77c3dfd
FV
2376
2377
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track page URL is first resolved to a track record through the
    resolve.json API endpoint; the record's id is then used to request the
    stream definitions, from which the 128 kbit/s MP3 URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at `url`."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug come straight from the URL.
        uploader, slug_title = url_match.group(1), url_match.group(2)
        # NOTE(review): simple_title is not read anywhere in this method.
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical page URL and ask the API which track it names.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(
            self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(
            self._download_webpage(streams_url, full_title,
                                   u'Downloading stream definitions',
                                   u'unable to download stream definitions'))
        media_url = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': media_url,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
d77c3dfd 2434
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    The set page URL is resolved through the resolve.json API endpoint; for
    every track listed in the answer the stream definitions are requested
    and the 128 kbit/s MP3 URL is taken from them.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return one info dict per track of the set at `url`.

        Returns None (after reporting each error) when the API answer
        carries an 'errors' entry.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the set slug come straight from the URL.
        uploader, slug_title = url_match.group(1), url_match.group(2)
        # NOTE(review): simple_title is not read anywhere in this method.
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)

        videos = []
        for track in info['tracks']:
            track_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            streams = json.loads(stream_json)

            videos.append({
                'id': track_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2497
d77c3dfd
FV
2498
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the page at `url`.

        Raises ExtractorError when the URL is invalid or when the stream
        reference, the title or the file extension cannot be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page stores the stream path base64-encoded
        # and percent-encoded in the `jsclassref` script variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description (optional; a placeholder is used when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split on the LAST dot only: a plain split('.') unpacked into two
        # names blows up with ValueError as soon as the file name contains
        # more than one dot (or none at all).
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to extract video extension')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
d77c3dfd
FV
2545
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Decode `value` as UTF-8 if it is a byte string, else return it as is.

        Calling .decode() unconditionally fails on values that are already
        text (always on Python 3, and for non-ASCII text on Python 2).
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        `jsonData[fmt]` is either a mapping bitrate -> list of URLs or, when
        no bitrate information is present, directly the list of URLs.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL of `url_list` that can be opened, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            return url
        return None

    def _print_formats(self, formats):
        """Print a table of the available formats to standard output."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the cloudcast at `url`.

        Returns None after printing the format table when the 'listformats'
        downloader parameter is set. Raises ExtractorError when the URL is
        invalid, the API request fails, the requested format is unknown or
        none of the candidate media URLs can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        cloudcast = self._to_text(mobj.group(2))
        file_id = uploader + u'-' + cloudcast

        # construct API request from the matched groups; slicing the raw URL
        # on '/' only works when the URL happens to end with a slash.
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (uploader, cloudcast)
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (decoded first so that it also works where json.loads
        # does not accept bytes)
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # Take the first format that has at least one reachable URL.
            for format_param in formats.keys():
                file_url = self.check_urls(self.get_urls(formats, format_param))
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')

        file_url = self._to_text(file_url)
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
d77c3dfd
FV
2650
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _extract_references(self, info, pages):
        """Store `pages` as reference entries in info['list'] and extract them.

        `pages` are page names relative to the MainFolder directory; the
        results of extracting every referenced URL are concatenated.
        """
        info['list'] = [
            {
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(page),
            }
            for page in pages]
        results = []
        for entry in info['list']:
            results += self.extract(entry['url'])
        return results

    def _real_extract(self, url):
        """Extract a single video, a course page or the whole site root.

        Which of the three applies is decided by the `course` and `video`
        groups of _VALID_URL. Course and root pages are expanded by
        extracting every page they link to.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            return self._extract_references(info, links)
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Fetched through _download_webpage, like the course page above,
            # so that the text regex below is not applied to the raw bytes
            # that urlopen().read() returns.
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote='Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            return self._extract_references(info, links)
d77c3dfd
FV
2751
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at `url`.

        The video page supplies song name, performer, the mtvn URI and the
        playlist id; these are used to request the mediaGen XML, whose last
        <rendition> element provides the media URL and format.

        Raises ExtractorError when the URL is invalid, when any of the page
        fields is missing, when the metadata download fails or when no
        usable rendition is present.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The downloaded page is used as text by the regex searches, so the
        # matched groups need no further .decode() call.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any rendition')

        # For now, always pick the highest quality.
        # NOTE(review): this assumes the last <rendition> is the best one.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        except KeyError:
            raise ExtractorError('Invalid rendition field.')
        src_node = rendition.find('./src')
        if src_node is None:
            raise ExtractorError('Invalid rendition field.')
        video_url = src_node.text

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
6de7ef9b 2820
302efc19 2821
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com; returns one entry per segment."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id from the time in milliseconds and two random numbers."""
        millis = int(time.time() * 1000)
        first = random.randint(1000,1998)
        second = random.randint(1000,9999)
        return "%d%d%d" %(millis, first, second)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet as a list, permuted deterministically by `seed`."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        # One character leaves the alphabet per round until none is left.
        while alphabet:
            state = (state * 211 + 30031 ) % 65536
            position = int(math.floor(state / 65536 * len(alphabet) ))
            shuffled.append(alphabet.pop(position))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated indices in `fileId` through the mixed alphabet."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(piece)] for piece in fileId.split('*') if piece)

    def _real_extract(self, url):
        """Return a list with one info dict for every segment of the video."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            data = json.loads(jsondata)['data'][0]

            video_title = data['title']
            seed = data['seed']

            requested = self._downloader.params.get('format', None)
            available = list(data['streamfileids'].keys())

            if requested is None or requested == 'best':
                stream_format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                stream_format, ext = 'mp4', u'mp4'
            else:
                stream_format, ext = 'flv', u'flv'

            fileid = data['streamfileids'][stream_format]
            keys = [seg['k'] for seg in data['segs'][stream_format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 of the file id carry the segment number (hex),
        # so they are replaced for each segment.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key),
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
5dc846fa
FV
2914
2915
class XNXXIE(InfoExtractor):
    """Extractor for single video pages on video.xnxx.com."""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for `url`.

        Raises ExtractorError if the URL does not match _VALID_URL or if the
        media URL, the title or the thumbnail cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        found = re.search(self.VIDEO_URL_RE, webpage)
        if found is None:
            raise ExtractorError(u'Unable to extract video url')
        # The media location is stored percent-encoded in the page.
        video_url = compat_urllib_parse.unquote(found.group(1))

        found = re.search(self.VIDEO_TITLE_RE, webpage)
        if found is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = found.group(1)

        found = re.search(self.VIDEO_THUMB_RE, webpage)
        if found is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = found.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
fd873c69
FV
2959
2960
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post at `url`.

        The post page provides date, uploader and title (all optional) and
        the link to the video page; the video page lists the media URLs, of
        which the one with the largest numeric size field is chosen.

        Raises ExtractorError when the URL is invalid or when the video page
        URL or the media links cannot be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        mobj = re.search(r'title="Timestamp">(.*?)</a>', webpage)
        if mobj:
            try:
                # Convert timestring to a format suitable for filename
                timestamp = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d")
                upload_date = timestamp.strftime('%Y%m%d')
            except ValueError:
                # The date is optional; an unexpected timestamp layout must
                # not abort the whole extraction.
                upload_date = None
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        mobj = re.search(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page; every match
        # is a (size, url) tuple of strings.
        links = re.findall(r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if len(links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Pick the largest size. The size has to be compared as a number:
        # as a string '720' would sort after '1080'.
        best = max(links, key=lambda link: (int(link[0]), link[1]))
        # Only the URL is needed from here on.
        video_url = best[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
4cc3d074
PH
3070
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at `url`.

        The media URL is derived from the URL path alone; the page is only
        searched for title, date and description.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group(1)
        index_suffix = '/index.html'
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        def _findProp(rexp, default=None):
            # First group of `rexp` in the page, HTML-unescaped, or `default`.
            found = re.search(rexp, webpage)
            return unescapeHTML(found.group(1)) if found else default

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        # Last path component; also the fallback title.
        shortened_video_id = video_id.rpartition('/')[2]
        og_title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id)
        title = og_title.replace('NBA.com: ', '')

        # NOTE(review): 'uploader_date' is not one of the fields listed in
        # the InfoExtractor docstring ('upload_date' is) - confirm intent.
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
0b40544f
DV
3105
3106class JustinTVIE(InfoExtractor):
3107 """Information extractor for justin.tv and twitch.tv"""
2ab1c5ed
DV
3108 # TODO: One broadcast may be split into multiple videos. The key
3109 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3110 # starts at 1 and increases. Can we treat all parts as one video?
3111
4096b609 3112 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
0e16f094
PH
3113 (?:
3114 (?P<channelid>[^/]+)|
3115 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3116 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3117 )
3118 /?(?:\#.*)?$
3119 """
4096b609 3120 _JUSTIN_PAGE_LIMIT = 100
0b40544f
DV
3121 IE_NAME = u'justin.tv'
3122
4096b609
DV
3123 def report_download_page(self, channel, offset):
3124 """Report attempt to download a single page of videos."""
f17ce13a
JMF
3125 self.to_screen(u'%s: Downloading video information from %d to %d' %
3126 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
4096b609 3127
2ab1c5ed 3128 # Return count of items, list of *valid* items
46bfb422
JMF
3129 def _parse_page(self, url, video_id):
3130 webpage = self._download_webpage(url, video_id,
3131 u'Downloading video info JSON',
3132 u'unable to download video info JSON')
cdb30764 3133
0b40544f 3134 response = json.loads(webpage)
fa1bf9c6 3135 if type(response) != list:
3136 error_text = response.get('error', 'unknown error')
decd1d17 3137 raise ExtractorError(u'Justin.tv API: %s' % error_text)
0b40544f
DV
3138 info = []
3139 for clip in response:
3140 video_url = clip['video_file_url']
3141 if video_url:
3142 video_extension = os.path.splitext(video_url)[1][1:]
fa1bf9c6 3143 video_date = re.sub('-', '', clip['start_time'][:10])
3144 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
97f194c1
PH
3145 video_id = clip['id']
3146 video_title = clip.get('title', video_id)
0b40544f 3147 info.append({
97f194c1 3148 'id': video_id,
0b40544f 3149 'url': video_url,
97f194c1 3150 'title': video_title,
fa1bf9c6 3151 'uploader': clip.get('channel_name', video_uploader_id),
3152 'uploader_id': video_uploader_id,
0b40544f
DV
3153 'upload_date': video_date,
3154 'ext': video_extension,
3155 })
2ab1c5ed
DV
3156 return (len(response), info)
3157
def _real_extract(self, url):
    """Extract info for a channel archive, a chapter or a single broadcast.

    The named groups of ``_VALID_URL`` decide which case applies:
    ``channelid`` selects the paged archive listing, ``chapterid`` the
    chapter lookup (which returns right away with one entry) and
    ``videoid`` a single archived broadcast.

    Returns a list of info dictionaries.
    Raises ExtractorError if the URL does not match or chapter data is
    missing.
    """
    url_match = re.match(self._VALID_URL, url)
    if url_match is None:
        raise ExtractorError(u'invalid URL: %s' % url)

    api_base = 'http://api.justin.tv'
    channel_id = url_match.group('channelid')
    # Only the channel archive listing is fetched page by page.
    paged = bool(channel_id)
    if paged:
        video_id = channel_id
        api = api_base + '/channel/archives/%s.json' % video_id
    elif url_match.group('chapterid'):
        chapter_id = url_match.group('chapterid')

        # The chapter page names the archive the chapter was cut from.
        webpage = self._download_webpage(url, chapter_id)
        archive_match = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not archive_match:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = archive_match.group(1)

        api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(
            api, chapter_id,
            note=u'Downloading chapter information',
            errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
        # Pick the first <archive> node whose id equals the one on the page.
        archive = next(
            (node for node in doc.findall('.//archive')
             if archive_id == node.find('./id').text),
            None)
        if archive is None:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(
            chapter_api_url, u'c' + chapter_id,
            note='Downloading chapter metadata',
            errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(
            u'Chapter detected, but we can just download the whole file. '
            u'Chapter starts at %s and ends at %s'
            % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        channel_info = chapter_info['channel']
        return [{
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': channel_info['display_name'],
            'uploader_id': channel_info['name'],
        }]
    else:
        video_id = url_match.group('videoid')
        api = api_base + '/broadcast/by_archive/%s.json' % video_id

    self.report_extraction(video_id)

    collected = []
    offset = 0
    limit = self._JUSTIN_PAGE_LIMIT
    while True:
        if paged:
            self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url, video_id)
        collected.extend(page_info)
        # A page shorter than the limit is the last one; unpaged
        # requests are fetched exactly once.
        if not (paged and page_count == limit):
            break
        offset += limit
    return collected
21a9c6aa
PH
3238
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        Raises ExtractorError if the URL is invalid or the video source or
        title cannot be located in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = url_match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The second <source> element inside <video> carries the URL we use.
        source_match = re.search(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, re.DOTALL)
        if not source_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player headline, fall back to the document <title>.
        title_match = (
            re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
            or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
d0d4f277 3276
class SteamIE(InfoExtractor):
    """Information extractor for the trailers of a game on store.steampowered.com."""

    # Verbose pattern: must always be compiled/matched with re.VERBOSE.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding a playlist of the game's videos."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = url_match.group('gameID')
        # Request the age-check URL with a fixed birth date already filled in.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % game_id
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, game_id)
        header_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        game_title = header_match.group('game_title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # Movies, titles and thumbnails are paired up by their order in the page.
        triples = zip(re.finditer(movie_re, webpage),
                      re.finditer(name_re, webpage),
                      re.finditer(thumb_re, webpage))
        entries = []
        for movie_m, name_m, thumb_m in triples:
            movie_id = movie_m.group('videoID')
            movie_url = movie_m.group('videoURL')
            if not movie_url:
                raise ExtractorError(u'Cannot find video url for %s' % movie_id)
            entries.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(name_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return [self.playlist_result(entries, game_id, game_title)]
ef0c8d5f 3321
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        The media URL is built from the numeric id in the URL; title and
        uploader id are scraped from the page.

        Raises ExtractorError if the URL does not match or the title cannot
        be found.  A missing uploader only produces a warning.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The title is mandatory: fail with a proper extractor error instead
        # of an AttributeError on a None match.
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        title = m.group('title')

        # The uploader is optional metadata, so degrade gracefully.
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            self._downloader.report_warning(u'unable to extract uploader')
            uploader = None
        else:
            uploader = m.group('uploader')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
4aeae91f 3343
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        Raises ExtractorError if the media URL or the title is missing.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        src_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        # The container is guessed from the URL text.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is None:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)
        else:
            thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3392
ca0a0bbe
PH
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        The metadata is read from the JSON assigned to ``gon.show`` in an
        inline script of the page.

        Raises ExtractorError if that script is missing or is not valid JSON.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        gon_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not gon_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(gon_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension comes from the URL path so the query string is ignored.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
4aeae91f 3427
991ba7fa
JC
3428
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the entry of *formats* whose 'format' equals *req_format*, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of the video at *url*.

        Returns a list of info dictionaries selected according to the
        downloader's 'format' parameter ('best'/unset: first link, 'worst':
        last link, '-1'/'all': every link, anything else: exact match on the
        'format' field).  Returns None after printing the table when the
        'listformats' parameter is set.

        Raises ExtractorError if the URL is invalid, the title or download
        list cannot be found, or the requested format is not available.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The age_verified cookie is sent with the page request.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path component is "<size>_<bitrate>_<id>".
            size, bitrate = path.split('/')[4].split('_')[:2]
            format_name = "-".join((size, bitrate))
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_name,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            # Test the lookup result itself; the old code tested a stale
            # regex match and therefore returned [None] for unknown formats.
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
3543
6324fd1d 3544
991ba7fa
JC
3545
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        Video id and title are taken from the URL itself; the media URL and
        the upload date are scraped from the page.

        Raises ExtractorError if the URL is invalid or the media URL or the
        upload date cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This branch is about the date, not the title (message fixed).
            raise ExtractorError(u'Unable to extract upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3584
991ba7fa
JC
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        The watch page supplies the title and a link to the embed page; the
        embed page supplies the media URL.  The id reported is the numeric
        one found in the embed link.

        Raises ExtractorError if any of those pieces cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Watch page: title and embed link.
        watch_page = self._download_webpage(url, url_match.group('videoid'))

        title_match = re.search(r'<title>(?P<title>.*)</title>', watch_page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', watch_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # Embed page: the actual media URL.
        embed_page = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3629
ccf65f9d
PH
class EightTracksIE(InfoExtractor):
    """Information extractor for mixes on 8tracks.com."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at *url*.

        Tracks are requested one at a time from the play/next endpoints
        under a randomly chosen session number until the API reports the
        last track.

        Raises ExtractorError if the URL is invalid or the mix data cannot
        be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        tracks = []
        track_number = 1
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        while True:
            api_json = self._download_webpage(
                next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                return tracks
            # The next request names the track that was just received.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_number += 1
991ba7fa 3673
da06e2da
OK
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        Media and thumbnail URLs are built from the id in the URL; title and
        uploader are scraped from the page.

        Raises ExtractorError if the URL does not match or the title cannot
        be found.  A missing uploader only produces a warning.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        # The title is mandatory: raise a proper extractor error instead of
        # an AttributeError on a None match.
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m.group('title'))

        # The uploader is optional metadata, so degrade gracefully.
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            self._downloader.report_warning(u'unable to extract uploader')
            uploader = None
        else:
            uploader = clean_html(m.group('uploader'))

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
3697
class TEDIE(InfoExtractor):
    """Information extractor for talks and playlists on ted.com."""

    # Verbose pattern: must always be matched with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to the talk or the playlist extraction, wrapped in a list."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_re = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        link_re = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(item_re, webpage, re.VERBOSE)
        link_matches = re.finditer(link_re, webpage)

        headline = re.search(
            r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>',
            webpage)
        playlist_title = headline.group('playlist_title')

        # Items and title links are paired by order; the item matches only
        # bound how many links are used.
        playlist_entries = [
            self.url_result('http://www.ted.com%s' % link.group('talk_url'), 'TED')
            for _item, link in zip(item_matches, link_matches)
        ]
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        # If the url includes the language we get the title translated
        title = re.search(r'<span id="altHeadline" >(?P<title>.*)</span>', webpage).group('title')
        details_re = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_match = re.search(r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"', webpage)
        details = re.search(details_re, webpage, re.VERBOSE)
        # The id found in the page replaces the video_id argument.
        video_id = details.group('videoID')
        video_url = self._talk_video_link(details.group('mediaSlug'))
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
da06e2da 3776
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        The video id is the last non-empty path element of the URL; all other
        fields come from the XML document served by the metadata endpoint.

        Raises ExtractorError if the metadata lacks the download URL or the
        title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id the format defaults to the extension.
        # (The old code referenced an undefined name `ext` here.)
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3830
class SpiegelIE(InfoExtractor):
    """Information extractor for videos on spiegel.de."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        The title is scraped from the page; file name and duration are read
        from the last child element of the per-video XML document.

        Raises ExtractorError if the title cannot be found.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last entry of the document is used.
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3863
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # NOTE: the scheme was previously written `http?://`, which makes the
    # final "p" optional and therefore never matches https:// URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        Raises ExtractorError if the URL is invalid or the media URL or the
        title cannot be found.  Description and uploader are optional and
        set to None when missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            raise ExtractorError(u'Unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        # Drop the site name prefix from the Open Graph title.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
3910
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        Among the streams of media type 0 the one with the highest quality
        number is chosen.  For RTMP streams the info dictionary additionally
        carries 'play_path'.

        Raises ExtractorError if the id, the title or a usable stream cannot
        be determined.  These conditions used to be `assert`s (skipped when
        Python runs with -O) or unchecked lookups.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            if m is None:
                raise ExtractorError(u'Invalid URL: %s' % url)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')

        streams = [sm.groupdict() for sm in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # A page mentioning "fsk" without any stream is age restricted.
            if '"fsk"' in html:
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'No media streams found')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'No stream of the default media type found')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
3949
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for *url*.

        Returns an empty list (after a screen message) when the post page
        contains no video.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Always fetch the canonical /post/ page, whatever form was given.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(post_url, video_id)

        # The player markup is embedded with \x22 standing in for quotes.
        video_re = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(video_re, webpage)
        if video_match is None:
            self.to_screen("No video founded")
            return []

        # We pick the first poster
        poster_match = re.search(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22', webpage)
        thumb = poster_match.group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title_match = re.search(r'<title>(?P<title>.*?)</title>', webpage, re.DOTALL)
        title = unescapeHTML(title_match.group('title'))

        return [{
            'id': video_id,
            'url': video_match.group('video_url'),
            'title': title,
            'thumbnail': thumb,
            'ext': video_match.group('ext'),
        }]
3983
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the mp3-320 download of a free Bandcamp track.

        Returns a list with a single info dictionary.

        Raises ExtractorError if the track has no free download page or if
        any of the intermediate pages does not have the expected layout.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        # Named track_id so that the builtin id() is not shadowed
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download URL')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download URL')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4029
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 source and the title from a redtube video page.

        Returns a list with a single info dictionary.

        Raises ExtractorError if the URL is not valid or if the media URL
        or the title cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # [^"]+ keeps the capture inside the src attribute; a greedy .+
        # would run on to the last matching type attribute of the line.
        mobj = re.search(r'<source src="([^"]+)" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        # Non-greedy, so that the capture stops at the first closing </h1>
        mobj = re.search(r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
7f5bd09b 4060
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    # The dot after www is escaped so that it only matches a literal dot
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract the mp4 URL and the title from the MRSS notice of a video.

        Returns a list with a single info dictionary.

        Raises ExtractorError if the URL is not valid or if the media URL
        or the title cannot be found in the MRSS document.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group('mp4url')

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group('titre')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
e32b06e9 4089
4aeae91f
PH
def gen_extractors():
    """ Build one fresh instance of each supported extractor.
    The list is ordered on purpose: the first extractor that matches a URL
    is the one that handles it.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ordered_classes]
93412126
JMF
4149
def get_info_extractor(ie_name):
    """Look up the extractor class called ie_name + 'IE' among the module globals.

    Raises KeyError if this module defines no such name.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]