]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
Fix some IEs that didn't return the upload_date in the YYYYMMDD format
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
9e8056d5
PH
4from __future__ import absolute_import
5
4fcca4bb 6import base64
d77c3dfd 7import datetime
ccf65f9d 8import itertools
d77c3dfd
FV
9import netrc
10import os
11import re
12import socket
13import time
d77c3dfd 14import email.utils
921a1455 15import xml.etree.ElementTree
302efc19 16import random
17import math
6324fd1d 18import operator
d77c3dfd 19
9e8056d5 20from .utils import *
d77c3dfd
FV
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Extractor name: the class name with its last two characters removed."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note=None prints the default "Downloading webpage" message,
        note=False prints nothing, any other value is printed as given.
        Network failures are re-raised as ExtractorError, prefixed with
        errnote (or a default message when errnote is None).
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the data of the page as a (unicode) string.

        The text encoding is taken from the charset parameter of the
        Content-Type response header; UTF-8 is used when the header has no
        charset or names a codec Python does not know.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Capture only the charset token: the value may be quoted and may be
        # followed by further ";"-separated parameters.
        m = re.search(r'(?i);\s*charset\s*=\s*"?([^\s;"]+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a Request object
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server announced a charset with no matching codec
            return webpage_bytes.decode('utf-8', 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist; 'id' and 'title' are only set when truthy."""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
64ce2aad 185
d830b7c2 186
d77c3dfd 187class YoutubeIE(InfoExtractor):
59ae15a5
PH
188 """Information extractor for youtube.com."""
189
190 _VALID_URL = r"""^
191 (
192 (?:https?://)? # http(s):// (optional)
193 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
194 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
195 (?:.*?\#/)? # handle anchor (#/) redirect urls
59ae15a5
PH
196 (?: # the various things that can precede the ID:
197 (?:(?:v|embed|e)/) # v/ or embed/ or e/
198 |(?: # or the v= param in all its forms
199 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
200 (?:\?|\#!?) # the params delimiter ? or # or #!
3bb61659 201 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
59ae15a5
PH
202 v=
203 )
204 )? # optional -> youtube.com/xxxx is OK
205 )? # all until now is optional -> you can pass the naked ID
206 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
207 (?(1).+)? # if we found the ID, everything can follow
208 $"""
209 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
d3f5f9f6 210 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
59ae15a5
PH
211 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
212 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
213 _NETRC_MACHINE = 'youtube'
214 # Listed in order of quality
215 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
216 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
217 _video_extensions = {
218 '13': '3gp',
219 '17': 'mp4',
220 '18': 'mp4',
221 '22': 'mp4',
222 '37': 'mp4',
223 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
224 '43': 'webm',
225 '44': 'webm',
226 '45': 'webm',
227 '46': 'webm',
228 }
229 _video_dimensions = {
230 '5': '240x400',
231 '6': '???',
232 '13': '???',
233 '17': '144x176',
234 '18': '360x640',
235 '22': '720x1280',
236 '34': '360x640',
237 '35': '480x854',
238 '37': '1080x1920',
239 '38': '3072x4096',
240 '43': '360x640',
241 '44': '480x854',
242 '45': '720x1280',
243 '46': '1080x1920',
cdb30764 244 }
59ae15a5
PH
245 IE_NAME = u'youtube'
246
89de9eb1
FV
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle url.

    URLs claimed by YoutubePlaylistIE are rejected; everything else is
    matched against the verbose _VALID_URL pattern.
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    return (not claimed_by_playlist) and re.match(cls._VALID_URL, url, re.VERBOSE) is not None
59ae15a5
PH
252
def report_lang(self):
    """Report attempt to set language."""
    self.to_screen(u'Setting language')

def report_login(self):
    """Report attempt to log in."""
    self.to_screen(u'Logging in')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self.to_screen(u'%s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self.to_screen(u'%s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Report attempt to look up the subtitles available for a video."""
    self.to_screen(u'%s: Checking available subtitles' % video_id)

def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download subtitles in one language and format."""
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report available subtitles.

    sub_lang_list must be a mapping; its keys are printed comma-separated.
    """
    sub_lang = ",".join(list(sub_lang_list.keys()))
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self.to_screen(u'%s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""
    self.to_screen(u'%s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
59ae15a5 293
def _get_available_subtitles(self, video_id):
    """Fetch the subtitle track listing for video_id.

    Returns a dict mapping language code to track name on success, or a
    2-tuple (error_message, None) when the listing cannot be downloaded
    or contains no tracks.
    """
    self.report_video_subtitles_download(video_id)
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    request = compat_urllib_request.Request(list_url)
    try:
        listing = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None)
    # Each match is a (track name, language code) pair
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', listing)
    by_language = dict((code, name) for name, code in tracks)
    if by_language:
        return by_language
    return (u'video doesn\'t have subtitles', None)
ae608b80 306
2a4093ea
IM
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
310
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
    """Download one subtitle track.

    Return tuple:
    (error_message, sub_lang, sub)
    where error_message is None on success and the other two items are
    None on failure.
    """
    self.report_video_subtitles_request(video_id, sub_lang, format)
    query = compat_urllib_parse.urlencode({
        'lang': sub_lang,
        'name': sub_name,
        'v': video_id,
        'fmt': format,
    })
    subtitle_url = 'http://www.youtube.com/api/timedtext?' + query
    try:
        body = compat_urllib_request.urlopen(subtitle_url).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
    if body:
        return (None, sub_lang, body)
    # An empty response counts as a failed download
    return (u'Did not fetch video subtitles', None, None)
ae608b80
IM
331
332 def _extract_subtitle(self, video_id):
0fb37564
JMF
333 """
334 Return a list with a tuple:
335 [(error_message, sub_lang, sub)]
336 """
553d0974 337 sub_lang_list = self._get_available_subtitles(video_id)
9e62bc44 338 sub_format = self._downloader.params.get('subtitlesformat')
0fb37564
JMF
339 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
340 return [(sub_lang_list[0], None, None)]
ae608b80 341 if self._downloader.params.get('subtitleslang', False):
553d0974
IM
342 sub_lang = self._downloader.params.get('subtitleslang')
343 elif 'en' in sub_lang_list:
344 sub_lang = 'en'
ae608b80 345 else:
553d0974
IM
346 sub_lang = list(sub_lang_list.keys())[0]
347 if not sub_lang in sub_lang_list:
bc97f6d6 348 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
ae608b80 349
9e62bc44 350 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
553d0974 351 return [subtitle]
ae608b80
IM
352
353 def _extract_all_subtitles(self, video_id):
553d0974 354 sub_lang_list = self._get_available_subtitles(video_id)
9e62bc44 355 sub_format = self._downloader.params.get('subtitlesformat')
ef767f9f
JMF
356 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
357 return [(sub_lang_list[0], None, None)]
553d0974
IM
358 subtitles = []
359 for sub_lang in sub_lang_list:
9e62bc44 360 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
553d0974
IM
361 subtitles.append(subtitle)
362 return subtitles
056d8575 363
59ae15a5
PH
364 def _print_formats(self, formats):
365 print('Available formats:')
366 for x in formats:
367 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
368
369 def _real_initialize(self):
370 if self._downloader is None:
371 return
372
373 username = None
374 password = None
375 downloader_params = self._downloader.params
376
377 # Attempt to use provided username and password or .netrc data
378 if downloader_params.get('username', None) is not None:
379 username = downloader_params['username']
380 password = downloader_params['password']
381 elif downloader_params.get('usenetrc', False):
382 try:
383 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
384 if info is not None:
385 username = info[0]
386 password = info[2]
387 else:
388 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
389 except (IOError, netrc.NetrcParseError) as err:
2e5457be 390 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
59ae15a5
PH
391 return
392
393 # Set language
394 request = compat_urllib_request.Request(self._LANG_URL)
395 try:
396 self.report_lang()
397 compat_urllib_request.urlopen(request).read()
398 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2e5457be 399 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
59ae15a5
PH
400 return
401
402 # No authentication to be performed
403 if username is None:
404 return
405
d3f5f9f6
PH
406 request = compat_urllib_request.Request(self._LOGIN_URL)
407 try:
408 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
409 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2e5457be 410 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
d3f5f9f6
PH
411 return
412
413 galx = None
414 dsh = None
415 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
416 if match:
417 galx = match.group(1)
418
419 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
420 if match:
421 dsh = match.group(1)
422
59ae15a5 423 # Log in
d3f5f9f6
PH
424 login_form_strs = {
425 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
426 u'Email': username,
427 u'GALX': galx,
428 u'Passwd': password,
429 u'PersistentCookie': u'yes',
430 u'_utf8': u'霱',
431 u'bgresponse': u'js_disabled',
432 u'checkConnection': u'',
433 u'checkedDomains': u'youtube',
434 u'dnConn': u'',
435 u'dsh': dsh,
436 u'pstMsg': u'0',
437 u'rmShown': u'1',
438 u'secTok': u'',
439 u'signIn': u'Sign in',
440 u'timeStmp': u'',
441 u'service': u'youtube',
442 u'uilel': u'3',
443 u'hl': u'en_US',
444 }
445 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
446 # chokes on unicode
447 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
448 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
449 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
59ae15a5
PH
450 try:
451 self.report_login()
80d3177e 452 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
d3f5f9f6 453 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
2e5457be 454 self._downloader.report_warning(u'unable to log in: bad username or password')
59ae15a5
PH
455 return
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2e5457be 457 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
59ae15a5
PH
458 return
459
460 # Confirm age
461 age_form = {
462 'next_url': '/',
463 'action_confirm': 'Confirm',
464 }
465 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
466 try:
467 self.report_age_confirmation()
80d3177e 468 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5 469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
e5f30ade 470 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
59ae15a5
PH
471 return
472
3bb61659 473 def _extract_id(self, url):
59ae15a5
PH
474 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
475 if mobj is None:
e5f30ade 476 self._downloader.report_error(u'invalid URL: %s' % url)
59ae15a5
PH
477 return
478 video_id = mobj.group(2)
3bb61659
PH
479 return video_id
480
def _real_extract(self, url):
    """Extract the information dictionaries for one YouTube video.

    Downloads the watch page and the get_video_info data, collects the
    metadata (uploader, title, thumbnail, upload date, description,
    optional subtitles, duration) and selects the stream URL(s) according
    to the 'format', 'format_limit' and 'prefer_free_formats' downloader
    options.

    Returns a list with one dictionary per selected format, or None when
    an error was reported through the downloader or when only a listing
    ('listformats' / 'listsubtitles') was requested. Raises ExtractorError
    when no usable format information is found.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)
    if video_id is None:
        # _extract_id() already reported the invalid URL
        return

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
        return

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping the URL inside the page's script
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info: try each "el" variant until one yields a token
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                          % (video_id, el_type))
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                    note=False,
                                                    errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        if 'token' in video_info:
            break
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
        else:
            self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
        return

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.report_error(u'"rental" videos not supported')
        return

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        self._downloader.report_error(u'unable to extract uploader name')
        return
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        self._downloader.report_error(u'unable to extract video title')
        return
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Turn separators into single spaces before handing over to unified_strdate()
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = None

    if self._downloader.params.get('writesubtitles', False):
        video_subtitles = self._extract_subtitle(video_id)
        if video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitles[0]
            if sub_error:
                self._downloader.report_error(sub_error)

    if self._downloader.params.get('allsubtitles', False):
        video_subtitles = self._extract_all_subtitles(video_id)
        for video_subtitle in video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitle
            if sub_error:
                self._downloader.report_error(sub_error)

    if self._downloader.params.get('listsubtitles', False):
        # Listing only: print the languages and stop
        self._list_available_subtitles(video_id)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # token
    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
        url_map = {}
        for ud in url_data:
            stream_url = ud['url'][0]
            # The filter above only guarantees 'itag' and 'url'; append the
            # signature only when the entry actually carries one.
            if 'sig' in ud:
                stream_url += '&signature=' + ud['sig'][0]
            url_map[ud['itag'][0]] = stream_url

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])]  # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
        else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'))

        results.append({
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
        })
    return results
d77c3dfd
FV
676
677
678class MetacafeIE(InfoExtractor):
59ae15a5
PH
679 """Information Extractor for metacafe.com."""
680
681 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
682 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
683 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
684 IE_NAME = u'metacafe'
685
59ae15a5
PH
def report_disclaimer(self):
    """Announce that the disclaimer page is being retrieved."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
59ae15a5 689
59ae15a5
PH
def _real_initialize(self):
    """Fetch the disclaimer page, then submit the family-filter form.

    Both requests are best-effort: a network failure is reported through
    the downloader and ends the initialization. Nothing is returned.
    """
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
    try:
        self.report_disclaimer()
        compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
        return

    # Confirm age
    disclaimer_form = {
        'filters': '0',
        'submit': "Continue - I'm over 18",
    }
    # urlencode() output is pure ASCII; POST bodies must be bytes on Python 3
    post_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
    request = compat_urllib_request.Request(self._FILTER_POST, post_data)
    try:
        self.report_age_confirmation()
        compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
        return
712
def _real_extract(self, url):
    """Extract download information for a single Metacafe video.

    Returns a one-element list.  For ids prefixed with ``yt-`` the element
    is a url_result pointing at the corresponding YouTube watch page;
    otherwise it is an info dictionary with ``id``, ``url``, ``uploader``,
    ``upload_date`` (always None here), ``title`` and ``ext``.

    On any extraction failure the error is reported through the downloader
    and None is returned.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        self._downloader.report_error(u'invalid URL: %s' % url)
        return

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

    # Retrieve video webpage to extract further information
    webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
    if mobj is not None:
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # The extension is taken to be the last three characters of the URL.
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        if mobj is None:
            video_url = mediaURL
        else:
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(1))
    else:
        # Fallback: the media data is embedded in the "flashvars" attribute.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        # Slashes are backslash-escaped inside the embedded data.
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract title')
        return
    video_title = mobj.group(1)

    mobj = re.search(r'submitter=(.*?);', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract uploader nickname')
        return
    video_uploader = mobj.group(1)

    # The values are used as-is, the same way DailymotionIE and GenericIE
    # treat the result of _download_webpage: calling .decode() on them
    # fails on Python 3 (str has no decode) and on non-ASCII text on
    # Python 2 (implicit ASCII encode).
    return [{
        'id': video_id,
        'url': video_url,
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
    }]
d77c3dfd
FV
781
782
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Errors are reported through the downloader and yield None; a missing
        uploader only produces a warning.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the part of the last path component before any '_' or '?'.
        video_id = url_match.group(1).split('_')[0].split('?')[0]

        # Fetch the page with the family filter cookie switched off.
        page_request = compat_urllib_request.Request(url)
        page_request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(page_request, video_id)

        self.report_extraction(video_id)
        vars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if vars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(vars_match.group(1))

        # Pick the first key present in flashvars, in this order.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self.to_screen(u'Using %s' % max_quality)

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        unquoted_url = compat_urllib_parse.unquote(url_match.group(1))
        video_url = unquoted_url.replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Regular owner markup first, then the "official user" markup.
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = None

        # The three captured date fields are emitted in reverse order
        # (third, second, first) to form the upload_date string.
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is None:
            video_upload_date = None
        else:
            first, second, third = date_match.groups()
            video_upload_date = third + second + first

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': 'mp4',
        }
        return [info]
d77c3dfd
FV
863
864
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Any failure is reported through the downloader and yields None.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        # Decode once, right after the download, so the text patterns below
        # are matched against text (bytes would raise TypeError on Python 3)
        # and no per-field decoding is needed.  Undecodable bytes are
        # replaced rather than aborting the extraction.
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # The page title carries both the video title and the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
d77c3dfd
FV
917
918
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _fetch_yahoo_page(self, request, video_id=None):
        """Download `request` and return its body as text.

        When `video_id` is given the download is announced first.  On a
        network error the error is reported and None is returned.
        """
        try:
            if video_id is not None:
                self.report_download_webpage(video_id)
            page_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return None
        # Decode once so the text patterns match on Python 3 as well and
        # no extracted field has to be decoded individually.
        return page_bytes.decode('utf-8', 'replace')

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dictionary for `url`.

        URLs that match _VALID_URL but not _VPAGE_URL are first rewritten
        into a '/watch/' URL using the ids found in the page, and extraction
        restarts on that URL.  Any failure is reported through the
        downloader and yields None.  `new_video` is accepted for interface
        compatibility and is not read here.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            webpage = self._fetch_yahoo_page(compat_urllib_request.Request(url))
            if webpage is None:
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        webpage = self._fetch_yahoo_page(compat_urllib_request.Request(url), video_id)
        if webpage is None:
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Group 1 is only the literal "people"/"profile" path segment;
        # the link text (the uploader) is group 2.
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        webpage = self._fetch_yahoo_page(request, video_id)
        if webpage is None:
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
1049
1050
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dictionary for `url`.

        The metadata comes from the JSON "config" object embedded in the
        page; description and upload date come from the page markup.  Any
        failure is reported through the downloader and yields None.
        `new_video` is accepted for interface compatibility and is not read
        here.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Direct play_redirect_hls links are replaced by the video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is absent from the page.
            # ValueError: the extracted text is not valid JSON.
            # Only these are caught so that KeyboardInterrupt/SystemExit
            # and genuine programming errors are not swallowed.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        # The uploader id is the last path component of the owner URL.
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        available = config["video"]["files"]
        for codec_name, codec_extension in codecs:
            if codec_name in available:
                if 'hd' in available[codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in available[codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, available[codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
1161
1162
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download `url` and return the raw page content.

        On failure the error is reported and None is returned.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return None
        except ValueError:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return None
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download `url`, search it with `regex` and collect named groups.

        `matchTuples` is a list of ``(group_index, key, error_message)``;
        each group is stored under `key` in the returned dict.  Returns None
        (after reporting an error) if the download fails, the pattern does
        not match, or one of the requested groups is empty.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # The download error has already been reported.
            return None

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return None

        info = {}
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return None
            info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the stream location of a live page.

        NOTE(review): the resolved stream URL is computed but never
        returned, so callers get None in every case; this looks unfinished
        and is kept as-is.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of reference documents for a video page.

        Returns the info dictionary, or None if any step fails (the error
        has been reported by then).
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return None

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary, or None.

        Live pages (last URL component matching _LIVE_URL) are processed by
        extractLiveStream and produce no result.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        if info is None:
            # Extraction failed and was reported; do not return [None].
            return
        return [info]
f2ad10a9
CA
1286
1287
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to `new_url` is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on HTTP 405) and compares
        the final URL with `url`.  Returns the new URL if it differs,
        otherwise False.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers are not carried over to the new request.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            # This opener only has HTTP(S) handlers; for any other scheme
            # no handler answers and open() returns None.  Treat that as
            # "no redirect" instead of crashing on None.geturl().
            return False
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Return a one-element list describing the video found at `url`.

        If the URL redirects, a url_result for the target is returned
        instead.  Otherwise the page is scanned for a media URL with
        progressively broader patterns.  Failures are reported through the
        downloader and yield None.
        """
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # Group 1 is mandatory in all three patterns above, so a successful
        # match always carries it; no separate None check is needed.
        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Split the file name into id (stem) and extension (without the dot).
        video_id, video_extension = os.path.splitext(video_id)
        video_extension = video_extension[1:]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
d77c3dfd
FV
1424
1425
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # _real_extract encodes the query as UTF-8, so decode it the same
        # way instead of with the locale's preferred encoding.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ``ytsearch[N|all]:terms`` query and return its results.

        Returns the list produced by _get_n_results, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._max_youtube_results)

        # Only the integer conversion is guarded, so that a ValueError
        # raised while fetching results does not trigger a second search.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages of 50 results are requested until `n` ids are collected or
        the API reports no more items.  Returns a list of url_result
        entries, or None after reporting an error.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more than the API says exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the total past n.
        video_ids = video_ids[:n]
        return [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                for video_id in video_ids]
d77c3dfd
FV
1496
1497
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # _real_extract encodes the query as UTF-8, so decode it the same
        # way instead of with the locale's preferred encoding.
        query = query.decode('utf-8')
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ``gvsearch[N|all]:terms`` query and download its results.

        The downloads are triggered directly through the downloader; the
        method itself always returns None.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        # Only the integer conversion is guarded, so that a ValueError
        # raised during the downloads does not trigger a second search.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query

        Result pages are scanned until `n` distinct video ids are found or
        no "next page" indicator is present; the collected videos are then
        handed to the downloader.  If a result page cannot be fetched the
        error is reported and nothing is downloaded.
        """
        video_ids = []
        pagenum = 0

        while True:
            # Pages are announced 1-based, like YoutubeSearchIE does.
            self.report_download_page(query, pagenum + 1)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page_bytes = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return
            # Decode so the text patterns below also match on Python 3.
            page = page_bytes.decode('utf-8', 'replace')

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        break

            if len(video_ids) >= n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

            pagenum = pagenum + 1

        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
d77c3dfd
FV
1575
1576
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ``yvsearch[N|all]:<terms>`` query and download results.

        An empty count means one result, ``all`` means
        ``_max_yahoo_results`` and a positive integer means that many
        (capped at ``_max_yahoo_results`` with a warning). Errors are
        reported through the downloader; nothing is returned.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only: _VALID_URL allows colons inside
        # the search terms, and an unbounded split would fail to unpack.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the 8-character 'yvsearch' keyword
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Keep the try body minimal so that a ValueError raised while
        # downloading is not mistaken for an unparsable count.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # O(1) duplicate check; video_ids keeps order
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for video_id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                        return

            # No further result page: download what was collected so far.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            pagenum = pagenum + 1
d77c3dfd
FV
1658
1659
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose pattern: must always be matched with re.VERBOSE.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        message = u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Page through the playlist feed and return one playlist result.

        Each API page holds at most ``_MAX_RESULTS`` entries; paging stops
        at the first page that is missing ``entry`` or is not full. Entries
        are ordered by their ``yt$position`` before being wrapped.
        """
        # Extract playlist id
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = match.group(1) or match.group(2)
        positioned = []  # (position, content url) pairs
        page_num = 1

        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                raw = compat_urllib_request.urlopen(api_url).read()
                page = raw.decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            entries = feed['entry']
            for entry in entries:
                # Entries without 'content' carry no video URL; skip them.
                if 'content' in entry:
                    positioned.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        ordered_urls = [src for _, src in sorted(positioned)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
d77c3dfd
FV
1738
1739
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def extract_videos_from_page(self, page):
        """Return the distinct watch ids linked from ``page``, in page order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id in found:
                continue
            found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect every video id of a channel and return one playlist result.

        The first page is plain HTML; as long as the load-more marker is
        present, further pages come from the JSON ``channel_ajax`` endpoint.
        """
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = match.group(1)
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        video_ids = list(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        has_more = self._MORE_PAGES_INDICATOR in page
        while has_more:
            pagenum += 1

            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
            try:
                raw_page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            payload = json.loads(raw_page)

            video_ids.extend(self.extract_videos_from_page(payload['content_html']))

            has_more = self._MORE_PAGES_INDICATOR in payload['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return [self.playlist_result(url_entries, channel_id)]
902b2a0a
FV
1812
1813
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        last_index = start_index + self._GDATA_PAGE_SIZE
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, last_index))

    def _real_extract(self, url):
        """Gather all upload ids of a user and return one playlist result."""
        # Extract username
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = match.group(1)

        # The Data API caps the number of results per query (currently 50
        # videos), so the uploads feed is requested page by page until a
        # page comes back short.
        video_ids = []

        for pagenum in itertools.count():
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            request = compat_urllib_request.Request(gdata_url)

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, unique within this page
            ids_in_page = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page holding fewer than PAGE_SIZE ids is taken to be the
            # last one, which saves one extra (empty) request.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return [self.playlist_result(url_results, playlist_title = username)]
d77c3dfd
FV
1881
1882
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Collect all episode paths of a blip.tv user as one playlist result.

        The user's page is fetched first to read its ``data-users-id``
        attribute, which is then used to page through the episode-list
        endpoint. Failures are reported through the downloader and make
        the method return ``None``.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Without this check a page lacking the attribute would crash with
        # an AttributeError on None instead of reporting a clean error.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare the unescaped form, since that is what gets stored;
                # comparing the raw form would let escaped duplicates through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
eeeb4daa
JCGS
1958
1959
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com page to its direct file URL.

        Returns a one-element list with the info dictionary, or ``None``
        after reporting an error through the downloader.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None:
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None:
                # Raw string: '\s' in a plain literal is an invalid escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
d77c3dfd
FV
2010
2011
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with the configured credentials, if any.

        Credentials come from the downloader's ``username``/``password``
        params or, when ``usenetrc`` is set, from the ``.netrc`` entry for
        ``_NETRC_MACHINE``. Login problems only produce warnings.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Getting the login form back means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract a single Facebook video.

        Returns a one-element list with the info dictionary. Raises
        ``ExtractorError`` when the embedded player data, the video URL or
        the title cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables sit between these two script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD. Use .get() for both so
        # that a missing key reaches the ExtractorError below rather than
        # surfacing as a bare KeyError.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
59ae15a5 2109
d77c3dfd
FV
2110
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract one blip.tv video, either a direct file or a JSON-described post.

        ``/play/`` URLs are resolved through their redirect and re-entered
        as ``http://blip.tv/a/a-<file id>``. Returns a one-element list with
        the info dictionary, or ``None`` after an error was reported.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        parsed = compat_urllib_parse_urlparse(url)
        if parsed.path.startswith('/play/'):
            # The redirect target carries the file name in its fragment.
            play_request = compat_urllib_request.Request(url)
            play_response = compat_urllib_request.urlopen(play_request)
            redirect_parts = compat_urllib_parse_urlparse(play_response.geturl())
            file_param = compat_parse_qs(redirect_parts.fragment)['file'][0]
            file_id = file_param.rpartition('/')[2]
            return self._real_extract('http://blip.tv/a/a-' + file_id)

        separator = '&' if '?' in url else '?'
        json_url = url + separator + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(match.group(1))
        try:
            urlh = compat_urllib_request.urlopen(request)
            is_direct = urlh.headers.get('Content-Type', '').startswith('video/')
            if is_direct:  # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                return [{
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }]
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        # Regular URL: the response body is the JSON description.
        try:
            json_code = urlh.read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
            return

        try:
            json_data = json.loads(json_code)
            # The payload may or may not be wrapped in a 'Post' object.
            data = json_data['Post'] if 'Post' in json_data else json_data

            stamp = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p')
            upload_date = stamp.strftime('%Y%m%d')
            video_url = data['media']['url']
            ext_match = re.match(self._URL_EXT, video_url)
            if ext_match is None:
                raise ValueError('Can not determine filename extension')

            info = {
                'id': data['item_id'],
                'url': video_url,
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'ext': ext_match.group(1),
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl'],
                'user_agent': 'iTunes/10.6.1',
            }
        except (ValueError,KeyError) as err:
            self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
            return

        return [info]
d77c3dfd
FV
2207
2208
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract a single myvideo.de video.

        The media URL is derived from the page's ``image_src`` thumbnail
        link. Returns a one-element list with the info dictionary, or
        ``None`` after an error was reported through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this used the non-existent attribute self._download.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        # The .flv file lives next to the thumbs/ directory.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
d77c3dfd
FV
2250
2251class ComedyCentralIE(InfoExtractor):
59ae15a5
PH
2252 """Information extractor for The Daily Show and Colbert Report """
2253
ca6849e6 2254 # urls can be abbreviations like :thedailyshow or :colbert
cdb30764 2255 # urls for episodes like:
ca6849e6 2256 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2257 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
cdb30764 2258 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
ca6849e6 2259 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2260 |(https?://)?(www\.)?
2261 (?P<showname>thedailyshow|colbertnation)\.com/
2262 (full-episodes/(?P<episode>.*)|
2263 (?P<clip>
2264 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2265 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
cdb30764 2266 $"""
59ae15a5
PH
2267
2268 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2269
2270 _video_extensions = {
2271 '3500': 'mp4',
2272 '2200': 'mp4',
2273 '1700': 'mp4',
2274 '1200': 'mp4',
2275 '750': 'mp4',
2276 '400': 'mp4',
2277 }
2278 _video_dimensions = {
2279 '3500': '1280x720',
2280 '2200': '960x540',
2281 '1700': '768x432',
2282 '1200': '640x360',
2283 '750': '512x288',
2284 '400': '384x216',
2285 }
2286
89de9eb1
FV
2287 @classmethod
2288 def suitable(cls, url):
ca6849e6 2289 """Receives a URL and returns True if suitable for this IE."""
89de9eb1 2290 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
ca6849e6 2291
32635ec6 2292 def report_config_download(self, episode_id, media_id):
f17ce13a 2293 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
59ae15a5
PH
2294
2295 def report_index_download(self, episode_id):
f17ce13a 2296 self.to_screen(u'%s: Downloading show index' % episode_id)
59ae15a5 2297
59ae15a5
PH
2298 def _print_formats(self, formats):
2299 print('Available formats:')
2300 for x in formats:
2301 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2302
2303
2304 def _real_extract(self, url):
ca6849e6 2305 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5 2306 if mobj is None:
e5f30ade 2307 self._downloader.report_error(u'invalid URL: %s' % url)
59ae15a5
PH
2308 return
2309
2310 if mobj.group('shortname'):
2311 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2312 url = u'http://www.thedailyshow.com/full-episodes/'
2313 else:
2314 url = u'http://www.colbertnation.com/full-episodes/'
ca6849e6 2315 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2316 assert mobj is not None
2317
ca6849e6 2318 if mobj.group('clip'):
2319 if mobj.group('showname') == 'thedailyshow':
2320 epTitle = mobj.group('tdstitle')
2321 else:
2322 epTitle = mobj.group('cntitle')
2323 dlNewest = False
59ae15a5 2324 else:
ca6849e6 2325 dlNewest = not mobj.group('episode')
2326 if dlNewest:
2327 epTitle = mobj.group('showname')
2328 else:
2329 epTitle = mobj.group('episode')
59ae15a5
PH
2330
2331 req = compat_urllib_request.Request(url)
2332 self.report_extraction(epTitle)
2333 try:
2334 htmlHandle = compat_urllib_request.urlopen(req)
2335 html = htmlHandle.read()
93148102 2336 webpage = html.decode('utf-8')
59ae15a5 2337 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
e5f30ade 2338 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
59ae15a5
PH
2339 return
2340 if dlNewest:
2341 url = htmlHandle.geturl()
ca6849e6 2342 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5 2343 if mobj is None:
e5f30ade 2344 self._downloader.report_error(u'Invalid redirected URL: ' + url)
59ae15a5
PH
2345 return
2346 if mobj.group('episode') == '':
e5f30ade 2347 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
59ae15a5
PH
2348 return
2349 epTitle = mobj.group('episode')
2350
93148102 2351 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
59ae15a5
PH
2352
2353 if len(mMovieParams) == 0:
2354 # The Colbert Report embeds the information in a without
2355 # a URL prefix; so extract the alternate reference
2356 # and then add the URL prefix manually.
2357
93148102 2358 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
59ae15a5 2359 if len(altMovieParams) == 0:
e5f30ade 2360 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
59ae15a5
PH
2361 return
2362 else:
2363 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
cdb30764 2364
59ae15a5
PH
2365 uri = mMovieParams[0][1]
2366 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2367 self.report_index_download(epTitle)
2368 try:
2369 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2370 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
e5f30ade 2371 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
59ae15a5
PH
2372 return
2373
2374 results = []
2375
2376 idoc = xml.etree.ElementTree.fromstring(indexXml)
2377 itemEls = idoc.findall('.//item')
7717ae19 2378 for partNum,itemEl in enumerate(itemEls):
59ae15a5
PH
2379 mediaId = itemEl.findall('./guid')[0].text
2380 shortMediaId = mediaId.split(':')[-1]
2381 showId = mediaId.split(':')[-2].replace('.com', '')
2382 officialTitle = itemEl.findall('./title')[0].text
bf50b038 2383 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
59ae15a5
PH
2384
2385 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2386 compat_urllib_parse.urlencode({'uri': mediaId}))
2387 configReq = compat_urllib_request.Request(configUrl)
32635ec6 2388 self.report_config_download(epTitle, shortMediaId)
59ae15a5
PH
2389 try:
2390 configXml = compat_urllib_request.urlopen(configReq).read()
2391 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
e5f30ade 2392 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
59ae15a5
PH
2393 return
2394
2395 cdoc = xml.etree.ElementTree.fromstring(configXml)
2396 turls = []
2397 for rendition in cdoc.findall('.//rendition'):
2398 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2399 turls.append(finfo)
2400
2401 if len(turls) == 0:
c9fa1cba 2402 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
59ae15a5 2403 continue
cdb30764 2404
59ae15a5
PH
2405 if self._downloader.params.get('listformats', None):
2406 self._print_formats([i[0] for i in turls])
2407 return
2408
2409 # For now, just pick the highest bitrate
32635ec6 2410 format,rtmp_video_url = turls[-1]
59ae15a5
PH
2411
2412 # Get the format arg from the arg stream
2413 req_format = self._downloader.params.get('format', None)
2414
2415 # Select format if we can find one
2416 for f,v in turls:
2417 if f == req_format:
32635ec6 2418 format, rtmp_video_url = f, v
59ae15a5
PH
2419 break
2420
32635ec6
PH
2421 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2422 if not m:
2423 raise ExtractorError(u'Cannot transform RTMP url')
2424 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2425 video_url = base + m.group('finalid')
59ae15a5 2426
7717ae19 2427 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
59ae15a5
PH
2428 info = {
2429 'id': shortMediaId,
2430 'url': video_url,
2431 'uploader': showId,
2432 'upload_date': officialDate,
2433 'title': effTitle,
2434 'ext': 'mp4',
2435 'format': format,
2436 'thumbnail': None,
2437 'description': officialTitle,
59ae15a5 2438 }
59ae15a5 2439 results.append(info)
cdb30764 2440
59ae15a5 2441 return results
d77c3dfd
FV
2442
2443
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video described by an Escapist video page.

        Downloads the page, reads the description, thumbnail and player URL
        from its <meta> tags, follows the ``config=`` parameter of the player
        URL to the player configuration and takes the video URL from the
        configuration's playlist.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset announced by the server, default to UTF-8
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Description and thumbnail are optional fields: a page without them
        # must not abort the extraction.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL is mandatory, it carries the configuration URL
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        try:
            # The video is taken from the second playlist entry
            # (presumably the first one is an intro clip - TODO confirm)
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            self._downloader.report_error(u'unable to extract video URL from configuration')
            return

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
d77c3dfd 2514
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract a collegehumor.com video.

        Downloads the metadata XML for the video id, then the f4m manifest it
        points to, and builds the URL of the first fragment from the manifest.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # From here on video_id is the id found in the manifest, not the
            # one from the page URL (which is already stored in info['id'])
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: element missing; KeyError: <media> has no url attribute
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # The last two characters of the manifest id are dropped when
        # building the fragment path
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
d77c3dfd
FV
2581
2582
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract video URL, title and thumbnail from an xvideos.com page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error when any of the three values cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL: percent-encoded value of the flv_url parameter
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: the part of <title> in front of the site name
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return

        # Thumbnail: the whole matched URL is used, not only the file name group
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': title_match.group(1),
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
d77c3dfd
FV
2636
2637
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    The track page URL is resolved to its metadata through the
    api.soundcloud.com resolve endpoint; the numeric track id found there
    is then used to request the stream definitions, from which the
    128 kbit/s HTTP MP3 URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract a single soundcloud.com track.

        Returns a one-element list with the info dictionary, or None after
        reporting a download error.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the title slug come straight from the URL
        uploader, slug_title = url_match.group(1), url_match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical track URL and ask the API what it refers to
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            raw_info = compat_urllib_request.urlopen(compat_urllib_request.Request(resolv_url)).read()
            info_json = raw_info.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        track_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            raw_streams = compat_urllib_request.urlopen(compat_urllib_request.Request(streams_url)).read()
            stream_json = raw_streams.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        media_url = json.loads(stream_json)['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': media_url,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
d77c3dfd 2704
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

    The set page URL is resolved to its metadata through the
    api.soundcloud.com resolve endpoint; for every track listed there the
    stream definitions are requested and the 128 kbit/s HTTP MP3 URL is
    taken from them.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract every track of a soundcloud.com set.

        Returns a list with one info dictionary per track, or None after
        reporting an error (download failure or errors returned by the API).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of the set title (also in the url)
        slug_title = mobj.group(2)
        set_name = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(set_name)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # Report every error the API returned, then give up
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction(set_name)

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                # upload_date must be YYYYMMDD, so convert the API timestamp
                # the same way SoundcloudIE does
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2778
d77c3dfd
FV
2779
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMPE stream of an infoq.com presentation.

        The stream path is stored base64-encoded and percent-encoded in the
        page's ``jsclassref`` variable. Video id and extension are derived
        from the last path component of the stream URL.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split at the LAST dot only: a plain split('.') unpacked into two
        # names raises ValueError for names with zero or several dots.
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            self._downloader.report_error(u'unable to extract video extension')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
d77c3dfd
FV
2829
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either indexed by bitrate or, when there is no
        bitrate information, is the URL list itself. In the second case the
        bitrate lookup raises TypeError and the list is returned as is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None when none can be opened"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link, try the next one
                continue

        return None

    def _print_formats(self, formats):
        """Print one line per format/bitrate with the file extension."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a mixcloud.com cloudcast.

        Downloads the cloudcast JSON from the API, picks the requested
        format (or the first format with a working URL) and returns a
        one-element list with the info dictionary. Returns None after
        reporting an error, or after printing the formats when
        ``listformats`` is set.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; the groups are already text,
        # calling .decode() on them fails on Python 3
        uploader = mobj.group(1)
        slug = mobj.group(2)
        file_id = uploader + "-" + slug

        # construct API request from the matched groups, so that the result
        # does not depend on a trailing slash in the input URL
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + uploader + '/' + slug + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Kept separate from api_url so that "no media URL found" can never
        # be mistaken for the API URL
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats:
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            self._downloader.report_error(u'unable to find a working download URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
d77c3dfd
FV
2937
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a video, a whole course or the whole site.

        Depending on which URL parameters are present:
        - course and video: a single video, described by its metadata XML;
        - course only: every video linked from the course page;
        - neither: every course linked from the home page.
        The two playlist cases call self.extract() on every linked page and
        concatenate the results.

        Returns a list of info dictionaries, or None after reporting an
        error through the downloader. Raises ExtractorError for an invalid
        URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Fetch the page the same way the course branch does: the raw
            # bytes returned by urlopen().read() cannot be searched with a
            # text pattern on Python 3.
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote='Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
d77c3dfd
FV
3041
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a music video from an mtv.com video page.

        Song name, performer, mtvn_uri and playlist id are read from the
        page; they are used to request the mediaGen XML, whose last
        rendition supplies the video URL and format.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The groups matched in webpage are used as they are: the page is
        # searched with text patterns, so they are already text, and text
        # cannot be decoded again
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            self._downloader.report_error(u'unable to find any rendition')
            return

        # For now, always pick the highest quality.
        # (assumes renditions are listed in ascending quality - TODO confirm)
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: attribute missing; AttributeError: no <src> child
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
6de7ef9b 3117
302efc19 3118
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id: millisecond timestamp plus two random numbers."""
        millis = int(time.time() * 1000)
        first = random.randint(1000,1998)
        second = random.randint(1000,9999)
        return "%d%d%d" % (millis, first, second)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet shuffled by the generator started with seed.

        The result is a list of characters, not a string.
        """
        # "\:" below is a literal backslash followed by a colon
        remaining = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        seed = float(seed)
        # One character leaves the pool per round until it is empty
        while remaining:
            seed = (seed * 211 + 30031 ) % 65536
            position = int(math.floor(seed / 65536 * len(remaining) ))
            # All characters are distinct, so popping by position removes
            # exactly the character that was picked
            shuffled.append(remaining.pop(position))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indexes into the real file id."""
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        """Extract every segment of a Youku video.

        Returns a list with one info dictionary per segment, or None after
        reporting an error through the downloader.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            requested = self._downloader.params.get('format', None)
            available = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format to a stream name and file extension
            if requested is None or requested == 'best':
                stream = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                stream, ext = 'mp4', u'mp4'
            else:
                stream, ext = 'flv', u'flv'

            fileid = config['data'][0]['streamfileids'][stream]
            keys = [seg['k'] for seg in config['data'][0]['segs'][stream]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 of the file id (fileid[8:10]) hold the segment
        # number as two hexadecimal digits; they are replaced per segment
        return [{
            'id': '%s_part%02d' % (video_id, index),
            'url': 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (
                sid, index, '%s%02X%s' % (fileid[0:8], index, fileid[10:]), key),
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
        } for index, key in enumerate(keys)]
5dc846fa
FV
3220
3221
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list with the video info, or None on error."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        # The thumbnail is an optional field: warn instead of giving up on
        # an otherwise downloadable video.
        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:
            video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
fd873c69
FV
3276
3277
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading of the post entry."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list with the video info, or None on error."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract upload date and convert it to YYYYMMDD
        upload_date = None
        mobj = re.search(r'title="Timestamp">(.*?)</a>', webpage)
        if mobj:
            try:
                parsed = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d")
                upload_date = parsed.strftime('%Y%m%d')
            except ValueError:
                # The date is optional; an unexpected format must not abort
                upload_date = None
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: the first line of the description
        video_title = u'NA'
        mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Pick the highest resolution. The resolution is captured as a
        # string, so it is compared numerically ('1080' > '720').
        video_url = max(links, key=lambda link: int(link[0]))[1]

        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
4cc3d074
PH
3398
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list with the video info, or None on error."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the unescaped first group of rexp in the page, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')

        # The date belongs under 'upload_date' in YYYYMMDD form; it used to
        # be stored raw under the misspelled key 'uploader_date'.
        date_str = _findProp(r'<b>Date:</b> (.*?)</div>')
        upload_date = unified_strdate(date_str) if date_str else None

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'upload_date': upload_date,
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
0b40544f
DV
3434
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page.

        Returns a tuple (count of items on the page, list of info
        dictionaries for the *valid* items). On a download or API error the
        error is reported and (0, []) is returned, so the caller can always
        unpack the result.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # First ten characters of start_time, dashes removed
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Return the info dictionaries of a channel archive or a single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only the channel matched: walk the whole archive page by page
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page shorter than the limit is the last one
            if not paged or page_count != limit:
                break
            offset += limit
        return info
21a9c6aa
PH
3517
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list with the video info, or None on error."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Without this return the m.group() below fails on None
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player heading and fall back to the page <title>
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
d0d4f277 3556
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a game on store.steampowered.com"""

    # Verbose regular expression: must be matched with re.VERBOSE
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding a playlist of the game's videos."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Request the page through the age gate with a fixed birth date
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)

        # The page header is only used as the playlist title; a page without
        # it must not crash the extraction.
        title_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        game_title = title_match.group('game_title') if title_match else None

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # Movies, titles and thumbnails are paired by their order in the page
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
ef0c8d5f 3601
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv"""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a one-element list with the video info.

        Raises ExtractorError if the title cannot be found in the page.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # Non-greedy so the match stops at the closing quote of data-title
        # instead of the last quote on the line.
        m = re.search(r'data-title="(?P<title>.+?)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')

        # The uploader is optional: keep going without it
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader') if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
4aeae91f 3623
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com"""

    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list with the video info, or None when no video URL is found."""
        page = compat_urllib_request.urlopen(url).read().decode('utf-8')

        src_match = re.search(r"""(http://hw-videos.*(?:mp4|flv))""", page)
        video_id = re.match(self._VALID_URL, url).group('id')

        if src_match is None:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        video_url = src_match.group()
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        # Page title, or a timestamped placeholder when the page has none
        title_match = re.search(r"""<title>(.*)</title>""", page)
        if title_match is None:
            title = 'World Start Hip Hop - %s' % time.ctime()
        else:
            title = title_match.group(1)

        # Pages without a thumbnail are WSHH candy videos, whose real title
        # lives in the "candytitles" element.
        thumb_match = re.search(r"""rel="image_src" href="(.*)" />""", page)
        if thumb_match is None:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)
        else:
            thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3679
ca0a0bbe
PH
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com"""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the show info.

        Raises ExtractorError when the embedded metadata is missing or is
        not valid JSON.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is a JSON object assigned to gon.show
        meta = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if meta is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
4aeae91f 3714
991ba7fa
JC
3715
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' is req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Return the info dictionaries of the selected format(s).

        Returns None for an invalid URL, after listing formats, or when the
        requested format is not available. Raises ExtractorError when the
        title or the download list cannot be extracted.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path component is "<size>_<bitrate>_<id>"
            size, bitrate = path.split('/')[4].split('_')[:2]
            format_name = "-".join((size, bitrate))
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_name,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            # Test the looked-up format itself; the old check looked at an
            # unrelated regex match that is never None at this point.
            if selected is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [selected]
3832
6324fd1d 3833
991ba7fa
JC
3834
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the video info, or None on error."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken from the URL itself
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date. It is an optional field, so a miss is only
        # a warning (it used to abort with a misleading "video title" error).
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract upload date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3876
991ba7fa
JC
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the video info.

        Returns None for an invalid URL; raises ExtractorError when the
        title, the embed page or the video URL cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Watch page: provides the title and the link to the embed page
        page = self._download_webpage(url, url_match.group('videoid'))

        title_match = re.search(r'<title>(?P<title>.*)</title>', page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        # From here on the numeric id of the embed page is the video id
        video_id = embed_match.group('videoid')

        # Embed page: provides the actual video URL
        embed_page = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3922
ccf65f9d
PH
class EightTracksIE(InfoExtractor):
    """Information extractor for mixes on 8tracks.com"""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix.

        Raises ExtractorError for an invalid URL or when the mix data is
        not found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random play session; tracks are then requested one at a time
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        tracks = []
        track_no = 0
        while True:
            track_no += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_no), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track of the mix
            if api_data['set']['at_last_track']:
                return tracks
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
991ba7fa 3966
da06e2da
OK
class KeekIE(InfoExtractor):
    """Information extractor for keek.com"""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the video info.

        Raises ExtractorError if the title cannot be found in the page.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Video and thumbnail URLs are built directly from the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        title = unescapeHTML(m.group('title'))

        # The uploader is optional: keep going without it
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
3990
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    # Verbose regex: it must always be compiled/matched with re.VERBOSE.
    _VALID_URL = r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to the talk or the playlist extraction.

        Returns a one-element list holding either the info dictionary of a
        talk or the playlist result of a playlist.
        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist

        Every talk listed on the playlist page becomes a url_result entry
        pointing at the talk page, to be handled by this same extractor.
        Raises ExtractorError if the playlist title cannot be found.
        '''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        if m_playlist is None:
            raise ExtractorError(u'Cannot find playlist title')
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # The video matches are zipped in only to bound the number of
        # entries to the talks that have both a list item and a title link.
        for _m_video, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url

        video_id is only used while downloading the page; the id stored in
        the result is the one found in the talkDetails script of the page.
        Raises ExtractorError if the URL is invalid or if the title or the
        talk details cannot be found. A missing thumbnail is not an error.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading "%s" page' % videoName)

        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title_match = re.search(title_RE, webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        title = title_match.group('title')

        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        if info_match is None:
            raise ExtractorError(u'Cannot find talk details')
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)

        # The thumbnail is optional, so fall back to None when it is absent.
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        if thumb_match is None:
            thumbnail = None
        else:
            thumbnail = thumb_match.group('thumbnail')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
        }
        return info
da06e2da 4068
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Build the info dictionary from the metadata XML of the video.

        The video id is taken from the URL path and used to query the
        metadata XML, from which all the fields are read.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when the download URL or the title is
        missing from the metadata.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract mandatory values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text

        # extract optional values from metadata
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # The format defaults to the file extension.
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
        }
        return [info]
4124
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Build the info dictionary for the video referenced by url.

        The title is scraped from the web page; the file name and the
        duration are read from the last child element of the per-video XML
        document.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the title is not found or if the XML
        document lacks the expected elements.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        # Hand bytes to the XML parser, as MySpassIE does: parsing a
        # non-ASCII unicode string can fail on Python 2.
        idoc = xml.etree.ElementTree.fromstring(xml_code.encode('utf-8'))
        if len(idoc) == 0:
            raise ExtractorError(u'Cannot find any video type in the XML')
        last_type = idoc[-1]
        filename_el = last_type.find('./filename')
        duration_el = last_type.find('./duration')
        if filename_el is None or duration_el is None:
            raise ExtractorError(u'Cannot find filename or duration in the XML')
        filename = filename_el.text
        duration = float(duration_el.text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
4157
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    # 'https?' accepts both http and https ('http?' only matched "htt"
    # and "http", so https URLs were rejected).
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Build the info dictionary for the video referenced by url.

        The media URL, title, description and uploader are all scraped
        from the web page. Description and uploader are optional.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when the URL is invalid or when the media
        URL or the title cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # Stop here: without a match there is no title to read.
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader,
        }

        return [info]
4206
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de and mediathek.daserste.de."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Build the info dictionary for the video referenced by url.

        Among the media streams declared in the page, the one of media
        type 0 with the highest quality is selected. It is either an RTMP
        stream (the result then carries a 'play_path') or a plain HTTP
        download.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when the title or a usable stream cannot
        be found.
        Raises ExtractorError if the selected stream does not have the
        expected form.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # prefer the numeric documentId of the query string when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_match.group('title')

        streams = [stream_match.groupdict()
                   for stream_match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            if '"fsk"' in html:
                self._downloader.report_error(u'this video is only available after 8:00 pm')
            else:
                self._downloader.report_error(u'unable to find any media stream')
            return

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            self._downloader.report_error(u'unable to find a stream of the default media type')
            return
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            # Explicit checks instead of assert, which is stripped under -O.
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'unexpected video url: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
4246
c15e0241
JMF
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on tumblr.com blogs."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Build the info dictionary for the video of a tumblr post.

        The URL is normalized to the post page of the blog, from which the
        media URL, the extension, the thumbnail and the title are scraped.

        Returns a one-element list holding the info dictionary, or an
        empty list when the post contains no video.
        Raises ExtractorError if the URL is invalid or no title is found.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name comes from the URL, so it is escaped before being
        # embedded in the pattern: it must be matched literally.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video founded")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster; the thumbnail is optional, so a post
        # without posters yields None instead of a crash.
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        if m_thumb is None:
            thumb = None
        else:
            thumb = m_thumb.group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage)
        if m_title is None:
            raise ExtractorError(u'Cannot find title')
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4280
e32b06e9 4281
4aeae91f
PH
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Kept as an ordered tuple of classes: the instances are created in
    # exactly this order.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
93412126
JMF
4338
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes are module-level names of the form '<ie_name>IE'.
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]