#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import

import base64
import datetime
import itertools
import netrc
import os
import re
import socket
import time
import email.utils
import xml.etree.ElementTree
import random
import math
import operator

from .utils import *


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
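
    # A minimal, purely illustrative example of such a dictionary (every value
    # below is made up and not taken from any real extractor):
    #
    #   {
    #       'id':       u'abc123',
    #       'url':      u'http://example.com/video.mp4',
    #       'title':    u'Example video',
    #       'ext':      u'mp4',
    #       'uploader': u'example_user',
    #   }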

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        return webpage_bytes.decode(encoding, 'replace')
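
    # Hypothetical usage sketch from a subclass (the URL, video id and regex
    # below are placeholders for illustration, not part of any real extractor):
    #
    #   webpage = self._download_webpage('http://example.com/v/123', '123',
    #                                    note=u'Downloading example page')
    #   mobj = re.search(r'<title>(.*?)</title>', webpage)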


class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here it is! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt
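
    # For reference, each block appended by the loop above has the standard SRT
    # shape (index, time range, caption text); the times shown here are only
    # illustrative:
    #
    #   1
    #   00:00:01,000 --> 00:00:05,000
    #   First caption text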

    def _extract_subtitles(self, video_id):
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
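
    # Note: _extract_subtitles() always returns a (warning, srt) pair -- the
    # first element is a warning message (None on success), the second the
    # generated .srt contents (None on failure).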

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results


class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]


class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]


class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]


class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]


class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo-specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]


class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]


class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if the URL is a redirect (e.g. a URL shortener) and, if so, restart the chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1424class YoutubeSearchIE(InfoExtractor):
59ae15a5
PH
1425 """Information Extractor for YouTube search queries."""
1426 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1427 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1428 _max_youtube_results = 1000
1429 IE_NAME = u'youtube:search'
1430
1431 def __init__(self, downloader=None):
1432 InfoExtractor.__init__(self, downloader)
1433
1434 def report_download_page(self, query, pagenum):
1435 """Report attempt to download search page with given number."""
1436 query = query.decode(preferredencoding())
1437 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1438
1439 def _real_extract(self, query):
1440 mobj = re.match(self._VALID_URL, query)
1441 if mobj is None:
1442 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1443 return
1444
1445 prefix, query = query.split(':')
1446 prefix = prefix[8:]
1447 query = query.encode('utf-8')
1448 if prefix == '':
1449 self._download_n_results(query, 1)
1450 return
1451 elif prefix == 'all':
1452 self._download_n_results(query, self._max_youtube_results)
1453 return
1454 else:
1455 try:
1456 n = int(prefix)
1457 if n <= 0:
1458 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1459 return
1460 elif n > self._max_youtube_results:
2e5457be 1461 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
59ae15a5
PH
1462 n = self._max_youtube_results
1463 self._download_n_results(query, n)
1464 return
1465 except ValueError: # parsing prefix as integer fails
1466 self._download_n_results(query, 1)
1467 return
1468
1469 def _download_n_results(self, query, n):
1470 """Downloads a specified number of results for a query"""
1471
1472 video_ids = []
1473 pagenum = 0
1474 limit = n
1475
1476 while (50 * pagenum) < limit:
1477 self.report_download_page(query, pagenum+1)
1478 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1479 request = compat_urllib_request.Request(result_url)
1480 try:
d1b7a243 1481 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
1482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1483 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1484 return
1485 api_response = json.loads(data)['data']
1486
9e07cf29
J
1487 if 'items' not in api_response:
1488 self._downloader.trouble(u'[youtube] No video results')
1489 return
1490
59ae15a5
PH
1491 new_ids = list(video['id'] for video in api_response['items'])
1492 video_ids += new_ids
1493
1494 limit = min(n, api_response['totalItems'])
1495 pagenum += 1
1496
1497 if len(video_ids) > n:
1498 video_ids = video_ids[:n]
1499 for id in video_ids:
1500 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1501 return
d77c3dfd
FV
1502
1503
1504class GoogleSearchIE(InfoExtractor):
59ae15a5
PH
1505 """Information Extractor for Google Video search queries."""
1506 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1507 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1508 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1509 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1510 _max_google_results = 1000
1511 IE_NAME = u'video.google:search'
1512
1513 def __init__(self, downloader=None):
1514 InfoExtractor.__init__(self, downloader)
1515
1516 def report_download_page(self, query, pagenum):
1517 """Report attempt to download playlist page with given number."""
1518 query = query.decode(preferredencoding())
1519 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1520
1521 def _real_extract(self, query):
1522 mobj = re.match(self._VALID_URL, query)
1523 if mobj is None:
1524 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1525 return
1526
1527 prefix, query = query.split(':')
1528 prefix = prefix[8:]
1529 query = query.encode('utf-8')
1530 if prefix == '':
1531 self._download_n_results(query, 1)
1532 return
1533 elif prefix == 'all':
1534 self._download_n_results(query, self._max_google_results)
1535 return
1536 else:
1537 try:
1538 n = int(prefix)
1539 if n <= 0:
1540 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1541 return
1542 elif n > self._max_google_results:
2e5457be 1543 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
59ae15a5
PH
1544 n = self._max_google_results
1545 self._download_n_results(query, n)
1546 return
1547 except ValueError: # parsing prefix as integer fails
1548 self._download_n_results(query, 1)
1549 return
1550
1551 def _download_n_results(self, query, n):
1552 """Downloads a specified number of results for a query"""
1553
1554 video_ids = []
1555 pagenum = 0
1556
1557 while True:
1558 self.report_download_page(query, pagenum)
1559 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1560 request = compat_urllib_request.Request(result_url)
1561 try:
1562 page = compat_urllib_request.urlopen(request).read()
1563 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1564 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1565 return
1566
1567 # Extract video identifiers
1568 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1569 video_id = mobj.group(1)
1570 if video_id not in video_ids:
1571 video_ids.append(video_id)
1572 if len(video_ids) == n:
1573 # Specified n videos reached
1574 for id in video_ids:
1575 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1576 return
1577
1578 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1579 for id in video_ids:
1580 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1581 return
1582
1583 pagenum = pagenum + 1
d77c3dfd
FV
1584
1585
1586class YahooSearchIE(InfoExtractor):
59ae15a5 1587 """Information Extractor for Yahoo! Video search queries."""
93702113
FV
1588
1589 _WORKING = False
59ae15a5
PH
1590 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1591 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1592 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1593 _MORE_PAGES_INDICATOR = r'\s*Next'
1594 _max_yahoo_results = 1000
1595 IE_NAME = u'video.yahoo:search'
1596
1597 def __init__(self, downloader=None):
1598 InfoExtractor.__init__(self, downloader)
1599
1600 def report_download_page(self, query, pagenum):
1601 """Report attempt to download playlist page with given number."""
1602 query = query.decode(preferredencoding())
1603 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1604
1605 def _real_extract(self, query):
1606 mobj = re.match(self._VALID_URL, query)
1607 if mobj is None:
1608 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1609 return
1610
1611 prefix, query = query.split(':')
1612 prefix = prefix[8:]
1613 query = query.encode('utf-8')
1614 if prefix == '':
1615 self._download_n_results(query, 1)
1616 return
1617 elif prefix == 'all':
1618 self._download_n_results(query, self._max_yahoo_results)
1619 return
1620 else:
1621 try:
1622 n = int(prefix)
1623 if n <= 0:
1624 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1625 return
1626 elif n > self._max_yahoo_results:
2e5457be 1627 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
59ae15a5
PH
1628 n = self._max_yahoo_results
1629 self._download_n_results(query, n)
1630 return
1631 except ValueError: # parsing prefix as integer fails
1632 self._download_n_results(query, 1)
1633 return
1634
1635 def _download_n_results(self, query, n):
1636 """Downloads a specified number of results for a query"""
1637
1638 video_ids = []
1639 already_seen = set()
1640 pagenum = 1
1641
1642 while True:
1643 self.report_download_page(query, pagenum)
1644 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1645 request = compat_urllib_request.Request(result_url)
1646 try:
1647 page = compat_urllib_request.urlopen(request).read()
1648 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1649 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1650 return
1651
1652 # Extract video identifiers
1653 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1654 video_id = mobj.group(1)
1655 if video_id not in already_seen:
1656 video_ids.append(video_id)
1657 already_seen.add(video_id)
1658 if len(video_ids) == n:
1659 # Specified n videos reached
1660 for id in video_ids:
1661 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1662 return
1663
1664 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1665 for id in video_ids:
1666 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1667 return
1668
1669 pagenum = pagenum + 1
d77c3dfd
FV
1670
1671
1672class YoutubePlaylistIE(InfoExtractor):
59ae15a5
PH
1673 """Information Extractor for YouTube playlists."""
1674
6324fd1d
FV
1675 _VALID_URL = r"""(?:
1676 (?:https?://)?
1677 (?:\w+\.)?
1678 youtube\.com/
1679 (?:
89de9eb1
FV
1680 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1681 \? (?:.*?&)*? (?:p|a|list)=
6324fd1d
FV
1682 | user/.*?/user/
1683 | p/
1684 | user/.*?#[pg]/c/
1685 )
89de9eb1
FV
1686 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1687 .*
1688 |
1689 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1690 )"""
6324fd1d
FV
1691 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1692 _MAX_RESULTS = 50
59ae15a5
PH
1693 IE_NAME = u'youtube:playlist'
1694
1695 def __init__(self, downloader=None):
1696 InfoExtractor.__init__(self, downloader)
1697
89de9eb1
FV
1698 @classmethod
1699 def suitable(cls, url):
6324fd1d 1700 """Receives a URL and returns True if suitable for this IE."""
89de9eb1 1701 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
6324fd1d 1702
59ae15a5
PH
1703 def report_download_page(self, playlist_id, pagenum):
1704 """Report attempt to download playlist page with given number."""
1705 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1706
1707 def _real_extract(self, url):
1708 # Extract playlist id
6324fd1d 1709 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
1710 if mobj is None:
1711 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1712 return
1713
6324fd1d 1714 # Download playlist videos from API
89de9eb1 1715 playlist_id = mobj.group(1) or mobj.group(2)
6324fd1d
FV
1716 page_num = 1
1717 videos = []
59ae15a5
PH
1718
1719 while True:
6324fd1d
FV
1720 self.report_download_page(playlist_id, page_num)
1721
1722 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
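# Worked example of the paging arithmetic above (with _MAX_RESULTS = 50):
#   page_num 1 -> max-results=50&start-index=1   (playlist entries 1-50)
#   page_num 2 -> max-results=50&start-index=51  (playlist entries 51-100)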
59ae15a5 1723 try:
6324fd1d 1724 page = compat_urllib_request.urlopen(url).read().decode('utf8')
59ae15a5
PH
1725 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1726 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1727 return
1728
6324fd1d
FV
1729 try:
1730 response = json.loads(page)
1731 except ValueError as err:
1732 self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
1733 return
59ae15a5 1734
89de9eb1
FV
1735 if 'feed' not in response or 'entry' not in response['feed']:
1736 self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
1737 return
1738 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1739 for entry in response['feed']['entry']
1740 if 'content' in entry ]
6324fd1d
FV
1741
1742 if len(response['feed']['entry']) < self._MAX_RESULTS:
59ae15a5 1743 break
6324fd1d 1744 page_num += 1
59ae15a5 1745
691db5ba 1746 videos = [v[1] for v in sorted(videos)]
6324fd1d 1747 total = len(videos)
9789a05c 1748
59ae15a5
PH
1749 playliststart = self._downloader.params.get('playliststart', 1) - 1
1750 playlistend = self._downloader.params.get('playlistend', -1)
1751 if playlistend == -1:
6324fd1d 1752 videos = videos[playliststart:]
59ae15a5 1753 else:
6324fd1d 1754 videos = videos[playliststart:playlistend]
59ae15a5 1755
6324fd1d 1756 if len(videos) == total:
9789a05c
FV
1757 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1758 else:
6324fd1d 1759 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
9789a05c 1760
6324fd1d
FV
1761 for video in videos:
1762 self._downloader.download([video])
59ae15a5 1763 return
d77c3dfd
FV
1764
1765
902b2a0a 1766class YoutubeChannelIE(InfoExtractor):
59ae15a5
PH
1767 """Information Extractor for YouTube channels."""
1768
1769 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1770 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
9789a05c 1771 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
59ae15a5
PH
1772 IE_NAME = u'youtube:channel'
1773
1774 def report_download_page(self, channel_id, pagenum):
1775 """Report attempt to download channel page with given number."""
1776 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1777
1778 def _real_extract(self, url):
1779 # Extract channel id
1780 mobj = re.match(self._VALID_URL, url)
1781 if mobj is None:
1782 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1783 return
1784
1785 # Download channel pages
1786 channel_id = mobj.group(1)
1787 video_ids = []
1788 pagenum = 1
1789
1790 while True:
1791 self.report_download_page(channel_id, pagenum)
1792 url = self._TEMPLATE_URL % (channel_id, pagenum)
1793 request = compat_urllib_request.Request(url)
1794 try:
9789a05c 1795 page = compat_urllib_request.urlopen(request).read().decode('utf8')
59ae15a5
PH
1796 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1797 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1798 return
1799
1800 # Extract video identifiers
1801 ids_in_page = []
1802 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1803 if mobj.group(1) not in ids_in_page:
1804 ids_in_page.append(mobj.group(1))
1805 video_ids.extend(ids_in_page)
1806
9789a05c 1807 if self._MORE_PAGES_INDICATOR not in page:
59ae15a5
PH
1808 break
1809 pagenum = pagenum + 1
1810
9789a05c
FV
1811 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1812
59ae15a5
PH
1813 for id in video_ids:
1814 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1815 return
902b2a0a
FV
1816
1817
d77c3dfd 1818class YoutubeUserIE(InfoExtractor):
59ae15a5 1819 """Information Extractor for YouTube users."""
d77c3dfd 1820
59ae15a5
PH
1821 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1822 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1823 _GDATA_PAGE_SIZE = 50
1824 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1825 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1826 IE_NAME = u'youtube:user'
d77c3dfd 1827
59ae15a5
PH
1828 def __init__(self, downloader=None):
1829 InfoExtractor.__init__(self, downloader)
d77c3dfd 1830
59ae15a5
PH
1831 def report_download_page(self, username, start_index):
1832 """Report attempt to download user page."""
1833 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1834 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
d77c3dfd 1835
59ae15a5
PH
1836 def _real_extract(self, url):
1837 # Extract username
1838 mobj = re.match(self._VALID_URL, url)
1839 if mobj is None:
1840 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1841 return
d77c3dfd 1842
59ae15a5 1843 username = mobj.group(1)
d77c3dfd 1844
59ae15a5
PH
1845 # Download video ids using YouTube Data API. Result size per
1846 # query is limited (currently to 50 videos) so we need to query
1847 # page by page until there are no video ids - it means we got
1848 # all of them.
d77c3dfd 1849
59ae15a5
PH
1850 video_ids = []
1851 pagenum = 0
d77c3dfd 1852
59ae15a5
PH
1853 while True:
1854 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1855 self.report_download_page(username, start_index)
d77c3dfd 1856
59ae15a5 1857 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
d77c3dfd 1858
59ae15a5 1859 try:
80d3177e 1860 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
1861 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1862 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1863 return
d77c3dfd 1864
59ae15a5
PH
1865 # Extract video identifiers
1866 ids_in_page = []
d77c3dfd 1867
59ae15a5
PH
1868 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1869 if mobj.group(1) not in ids_in_page:
1870 ids_in_page.append(mobj.group(1))
d77c3dfd 1871
59ae15a5 1872 video_ids.extend(ids_in_page)
d77c3dfd 1873
59ae15a5
PH
1874 # A little optimization - if current page is not
1875 # "full", i.e. does not contain PAGE_SIZE video ids, then
1876 # we can assume that this page is the last one - there
1877 # are no more ids on further pages - no need to query
1878 # again.
d77c3dfd 1879
59ae15a5
PH
1880 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1881 break
d77c3dfd 1882
59ae15a5 1883 pagenum += 1
d77c3dfd 1884
59ae15a5
PH
1885 all_ids_count = len(video_ids)
1886 playliststart = self._downloader.params.get('playliststart', 1) - 1
1887 playlistend = self._downloader.params.get('playlistend', -1)
d77c3dfd 1888
59ae15a5
PH
1889 if playlistend == -1:
1890 video_ids = video_ids[playliststart:]
1891 else:
1892 video_ids = video_ids[playliststart:playlistend]
d77c3dfd 1893
59ae15a5
PH
1894 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1895 (username, all_ids_count, len(video_ids)))
d77c3dfd 1896
59ae15a5
PH
1897 for video_id in video_ids:
1898 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1899
1900
eeeb4daa 1901class BlipTVUserIE(InfoExtractor):
59ae15a5 1902 """Information Extractor for blip.tv users."""
eeeb4daa 1903
59ae15a5
PH
1904 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1905 _PAGE_SIZE = 12
1906 IE_NAME = u'blip.tv:user'
eeeb4daa 1907
59ae15a5
PH
1908 def __init__(self, downloader=None):
1909 InfoExtractor.__init__(self, downloader)
eeeb4daa 1910
59ae15a5
PH
1911 def report_download_page(self, username, pagenum):
1912 """Report attempt to download user page."""
1913 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1914 (self.IE_NAME, username, pagenum))
eeeb4daa 1915
59ae15a5
PH
1916 def _real_extract(self, url):
1917 # Extract username
1918 mobj = re.match(self._VALID_URL, url)
1919 if mobj is None:
1920 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1921 return
eeeb4daa 1922
59ae15a5 1923 username = mobj.group(1)
eeeb4daa 1924
59ae15a5 1925 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
eeeb4daa 1926
59ae15a5 1927 request = compat_urllib_request.Request(url)
eeeb4daa 1928
59ae15a5
PH
1929 try:
1930 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1931 mobj = re.search(r'data-users-id="([^"]+)"', page)
1932 page_base = page_base % mobj.group(1)
1933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1934 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1935 return
eeeb4daa
JCGS
1936
1937
59ae15a5
PH
1938 # Download video ids using BlipTV Ajax calls. Result size per
1939 # query is limited (currently to 12 videos) so we need to query
1940 # page by page until there are no video ids - it means we got
1941 # all of them.
eeeb4daa 1942
59ae15a5
PH
1943 video_ids = []
1944 pagenum = 1
eeeb4daa 1945
59ae15a5
PH
1946 while True:
1947 self.report_download_page(username, pagenum)
450e7099
PH
1948 url = page_base + "&page=" + str(pagenum)
1949 request = compat_urllib_request.Request( url )
59ae15a5
PH
1950 try:
1951 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1952 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1953 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1954 return
eeeb4daa 1955
59ae15a5
PH
1956 # Extract video identifiers
1957 ids_in_page = []
eeeb4daa 1958
59ae15a5
PH
1959 for mobj in re.finditer(r'href="/([^"]+)"', page):
1960 if mobj.group(1) not in ids_in_page:
1961 ids_in_page.append(unescapeHTML(mobj.group(1)))
eeeb4daa 1962
59ae15a5 1963 video_ids.extend(ids_in_page)
eeeb4daa 1964
59ae15a5
PH
1965 # A little optimization - if current page is not
1966 # "full", i.e. does not contain PAGE_SIZE video ids, then
1967 # we can assume that this page is the last one - there
1968 # are no more ids on further pages - no need to query
1969 # again.
eeeb4daa 1970
59ae15a5
PH
1971 if len(ids_in_page) < self._PAGE_SIZE:
1972 break
eeeb4daa 1973
59ae15a5 1974 pagenum += 1
eeeb4daa 1975
59ae15a5
PH
1976 all_ids_count = len(video_ids)
1977 playliststart = self._downloader.params.get('playliststart', 1) - 1
1978 playlistend = self._downloader.params.get('playlistend', -1)
eeeb4daa 1979
59ae15a5
PH
1980 if playlistend == -1:
1981 video_ids = video_ids[playliststart:]
1982 else:
1983 video_ids = video_ids[playliststart:playlistend]
eeeb4daa 1984
59ae15a5
PH
1985 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1986 (self.IE_NAME, username, all_ids_count, len(video_ids)))
eeeb4daa 1987
59ae15a5
PH
1988 for video_id in video_ids:
1989 self._downloader.download([u'http://blip.tv/'+video_id])
eeeb4daa
JCGS
1990
1991
d77c3dfd 1992class DepositFilesIE(InfoExtractor):
59ae15a5
PH
1993 """Information extractor for depositfiles.com"""
1994
1995 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
59ae15a5
PH
1996
1997 def report_download_webpage(self, file_id):
1998 """Report webpage download."""
1999 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2000
2001 def report_extraction(self, file_id):
2002 """Report information extraction."""
2003 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2004
2005 def _real_extract(self, url):
2006 file_id = url.split('/')[-1]
2007 # Rebuild url in english locale
2008 url = 'http://depositfiles.com/en/files/' + file_id
2009
2010 # Retrieve file webpage with 'Free download' button pressed
2011 free_download_indication = { 'gateway_result' : '1' }
2012 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2013 try:
2014 self.report_download_webpage(file_id)
2015 webpage = compat_urllib_request.urlopen(request).read()
2016 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2017 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2018 return
2019
2020 # Search for the real file URL
2021 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2022 if (mobj is None) or (mobj.group(1) is None):
2023 # Try to figure out reason of the error.
2024 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2025 if (mobj is not None) and (mobj.group(1) is not None):
2026 restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
2027 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2028 else:
2029 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2030 return
2031
2032 file_url = mobj.group(1)
2033 file_extension = os.path.splitext(file_url)[1][1:]
2034
2035 # Search for file title
2036 mobj = re.search(r'<b title="(.*?)">', webpage)
2037 if mobj is None:
2038 self._downloader.trouble(u'ERROR: unable to extract title')
2039 return
2040 file_title = mobj.group(1).decode('utf-8')
2041
2042 return [{
2043 'id': file_id.decode('utf-8'),
2044 'url': file_url.decode('utf-8'),
2045 'uploader': None,
2046 'upload_date': None,
2047 'title': file_title,
2048 'ext': file_extension.decode('utf-8'),
2049 }]
d77c3dfd
FV
2050
2051
2052class FacebookIE(InfoExtractor):
59ae15a5
PH
2053 """Information Extractor for Facebook"""
2054
59ae15a5
PH
2055 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2056 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2057 _NETRC_MACHINE = 'facebook'
59ae15a5
PH
2058 IE_NAME = u'facebook'
2059
59ae15a5
PH
2060 def report_login(self):
2061 """Report attempt to log in."""
b954070d 2062 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
59ae15a5
PH
2063
2064 def _real_initialize(self):
2065 if self._downloader is None:
2066 return
2067
2068 useremail = None
2069 password = None
2070 downloader_params = self._downloader.params
2071
2072 # Attempt to use provided username and password or .netrc data
2073 if downloader_params.get('username', None) is not None:
2074 useremail = downloader_params['username']
2075 password = downloader_params['password']
2076 elif downloader_params.get('usenetrc', False):
2077 try:
2078 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2079 if info is not None:
2080 useremail = info[0]
2081 password = info[2]
2082 else:
2083 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2084 except (IOError, netrc.NetrcParseError) as err:
2e5457be 2085 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
59ae15a5
PH
2086 return
2087
2088 if useremail is None:
2089 return
2090
2091 # Log in
2092 login_form = {
2093 'email': useremail,
2094 'pass': password,
2095 'login': 'Log+In'
2096 }
2097 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2098 try:
2099 self.report_login()
2100 login_results = compat_urllib_request.urlopen(request).read()
2101 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2e5457be 2102 self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
59ae15a5
PH
2103 return
2104 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2e5457be 2105 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
59ae15a5
PH
2106 return
2107
2108 def _real_extract(self, url):
2109 mobj = re.match(self._VALID_URL, url)
2110 if mobj is None:
2111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2112 return
2113 video_id = mobj.group('ID')
2114
b954070d
PH
2115 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2116 webpage = self._download_webpage(url, video_id)
2117
2118 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2119 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2120 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2121 if not m:
2122 raise ExtractorError(u'Cannot parse data')
2123 data = dict(json.loads(m.group(1)))
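# As inferred from the markers above and the lookups below (shape is
# illustrative, not an exact capture): the bracketed JSON is a list of
# [name, value] flashvar pairs, which dict() turns into a mapping whose
# 'params' entry is a URL-encoded JSON blob carrying hd_src, sd_src,
# video_duration and thumbnail_src.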
edba5137
PH
2124 params_raw = compat_urllib_parse.unquote(data['params'])
2125 params = json.loads(params_raw)
2126 video_url = params['hd_src']
7796e8c2
PH
2127 if not video_url:
2128 video_url = params['sd_src']
2129 if not video_url:
2130 raise ExtractorError(u'Cannot find video URL')
edba5137 2131 video_duration = int(params['video_duration'])
b954070d
PH
2132
2133 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2134 if not m:
2135 raise ExtractorError(u'Cannot find title in webpage')
2136 video_title = unescapeHTML(m.group(1))
2137
2138 info = {
2139 'id': video_id,
2140 'title': video_title,
2141 'url': video_url,
2142 'ext': 'mp4',
2143 'duration': video_duration,
edba5137 2144 'thumbnail': params['thumbnail_src'],
b954070d
PH
2145 }
2146 return [info]
59ae15a5 2147
d77c3dfd
FV
2148
2149class BlipTVIE(InfoExtractor):
59ae15a5
PH
2150 """Information extractor for blip.tv"""
2151
2152 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2153 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2154 IE_NAME = u'blip.tv'
2155
2156 def report_extraction(self, file_id):
2157 """Report information extraction."""
2158 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2159
2160 def report_direct_download(self, title):
2161 """Report information extraction."""
2162 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2163
2164 def _real_extract(self, url):
2165 mobj = re.match(self._VALID_URL, url)
2166 if mobj is None:
2167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2168 return
2169
f7b567ff
PH
2170 urlp = compat_urllib_parse_urlparse(url)
2171 if urlp.path.startswith('/play/'):
7f9d41a5
JCGS
2172 request = compat_urllib_request.Request(url)
2173 response = compat_urllib_request.urlopen(request)
2174 redirecturl = response.geturl()
f7b567ff
PH
2175 rurlp = compat_urllib_parse_urlparse(redirecturl)
2176 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2177 url = 'http://blip.tv/a/a-' + file_id
2178 return self._real_extract(url)
2179
7f9d41a5 2180
59ae15a5
PH
2181 if '?' in url:
2182 cchar = '&'
2183 else:
2184 cchar = '?'
2185 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
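# Illustrative request (the path is hypothetical): for
#   http://blip.tv/some-show/some-episode-1234567
# this builds
#   http://blip.tv/some-show/some-episode-1234567?skin=json&version=2&no_wrap=1
# which returns the episode metadata as JSON instead of the HTML page.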
55c05398 2186 request = compat_urllib_request.Request(json_url)
3446dfb7 2187 request.add_header('User-Agent', 'iTunes/10.6.1')
59ae15a5
PH
2188 self.report_extraction(mobj.group(1))
2189 info = None
2190 try:
2191 urlh = compat_urllib_request.urlopen(request)
2192 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2193 basename = url.split('/')[-1]
2194 title,ext = os.path.splitext(basename)
2195 title = title.decode('UTF-8')
2196 ext = ext.replace('.', '')
2197 self.report_direct_download(title)
2198 info = {
2199 'id': title,
2200 'url': url,
2201 'uploader': None,
2202 'upload_date': None,
2203 'title': title,
2204 'ext': ext,
2205 'urlhandle': urlh
2206 }
2207 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3446dfb7 2208 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
59ae15a5
PH
2209 if info is None: # Regular URL
2210 try:
55c05398
PH
2211 json_code_bytes = urlh.read()
2212 json_code = json_code_bytes.decode('utf-8')
59ae15a5
PH
2213 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2214 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2215 return
2216
2217 try:
2218 json_data = json.loads(json_code)
2219 if 'Post' in json_data:
2220 data = json_data['Post']
2221 else:
2222 data = json_data
2223
2224 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2225 video_url = data['media']['url']
2226 umobj = re.match(self._URL_EXT, video_url)
2227 if umobj is None:
2228 raise ValueError('Can not determine filename extension')
2229 ext = umobj.group(1)
2230
2231 info = {
2232 'id': data['item_id'],
2233 'url': video_url,
2234 'uploader': data['display_name'],
2235 'upload_date': upload_date,
2236 'title': data['title'],
2237 'ext': ext,
2238 'format': data['media']['mimeType'],
2239 'thumbnail': data['thumbnailUrl'],
2240 'description': data['description'],
3446dfb7
PH
2241 'player_url': data['embedUrl'],
2242 'user_agent': 'iTunes/10.6.1',
59ae15a5
PH
2243 }
2244 except (ValueError,KeyError) as err:
2245 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2246 return
2247
59ae15a5 2248 return [info]
d77c3dfd
FV
2249
2250
2251class MyVideoIE(InfoExtractor):
59ae15a5
PH
2252 """Information Extractor for myvideo.de."""
2253
2254 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2255 IE_NAME = u'myvideo'
2256
2257 def __init__(self, downloader=None):
2258 InfoExtractor.__init__(self, downloader)
cdb30764 2259
59ae15a5
PH
2260 def report_extraction(self, video_id):
2261 """Report information extraction."""
2262 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2263
2264 def _real_extract(self,url):
2265 mobj = re.match(self._VALID_URL, url)
2266 if mobj is None:
2267 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2268 return
2269
2270 video_id = mobj.group(1)
2271
2272 # Get video webpage
5f955171
PH
2273 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2274 webpage = self._download_webpage(webpage_url, video_id)
59ae15a5
PH
2275
2276 self.report_extraction(video_id)
6d436336 2277 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
59ae15a5
PH
2278 webpage)
2279 if mobj is None:
2280 self._downloader.trouble(u'ERROR: unable to extract media URL')
2281 return
2282 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2283
2284 mobj = re.search('<title>([^<]+)</title>', webpage)
2285 if mobj is None:
2286 self._downloader.trouble(u'ERROR: unable to extract title')
2287 return
2288
2289 video_title = mobj.group(1)
2290
2291 return [{
2292 'id': video_id,
2293 'url': video_url,
2294 'uploader': None,
2295 'upload_date': None,
2296 'title': video_title,
2297 'ext': u'flv',
2298 }]
d77c3dfd
FV
2299
2300class ComedyCentralIE(InfoExtractor):
59ae15a5
PH
2301 """Information extractor for The Daily Show and Colbert Report """
2302
ca6849e6 2303 # urls can be abbreviations like :thedailyshow or :colbert
cdb30764 2304 # urls for episodes like:
ca6849e6 2305 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2306 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
cdb30764 2307 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
ca6849e6 2308 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2309 |(https?://)?(www\.)?
2310 (?P<showname>thedailyshow|colbertnation)\.com/
2311 (full-episodes/(?P<episode>.*)|
2312 (?P<clip>
2313 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2314 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
cdb30764 2315 $"""
59ae15a5
PH
2316
2317 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2318
2319 _video_extensions = {
2320 '3500': 'mp4',
2321 '2200': 'mp4',
2322 '1700': 'mp4',
2323 '1200': 'mp4',
2324 '750': 'mp4',
2325 '400': 'mp4',
2326 }
2327 _video_dimensions = {
2328 '3500': '1280x720',
2329 '2200': '960x540',
2330 '1700': '768x432',
2331 '1200': '640x360',
2332 '750': '512x288',
2333 '400': '384x216',
2334 }
2335
89de9eb1
FV
2336 @classmethod
2337 def suitable(cls, url):
ca6849e6 2338 """Receives a URL and returns True if suitable for this IE."""
89de9eb1 2339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
ca6849e6 2340
59ae15a5
PH
2341 def report_extraction(self, episode_id):
2342 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2343
32635ec6
PH
2344 def report_config_download(self, episode_id, media_id):
2345 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
59ae15a5
PH
2346
2347 def report_index_download(self, episode_id):
2348 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2349
59ae15a5
PH
2350 def _print_formats(self, formats):
2351 print('Available formats:')
2352 for x in formats:
2353 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2354
2355
2356 def _real_extract(self, url):
ca6849e6 2357 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2358 if mobj is None:
2359 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2360 return
2361
2362 if mobj.group('shortname'):
2363 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2364 url = u'http://www.thedailyshow.com/full-episodes/'
2365 else:
2366 url = u'http://www.colbertnation.com/full-episodes/'
ca6849e6 2367 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2368 assert mobj is not None
2369
ca6849e6 2370 if mobj.group('clip'):
2371 if mobj.group('showname') == 'thedailyshow':
2372 epTitle = mobj.group('tdstitle')
2373 else:
2374 epTitle = mobj.group('cntitle')
2375 dlNewest = False
59ae15a5 2376 else:
ca6849e6 2377 dlNewest = not mobj.group('episode')
2378 if dlNewest:
2379 epTitle = mobj.group('showname')
2380 else:
2381 epTitle = mobj.group('episode')
59ae15a5
PH
2382
2383 req = compat_urllib_request.Request(url)
2384 self.report_extraction(epTitle)
2385 try:
2386 htmlHandle = compat_urllib_request.urlopen(req)
2387 html = htmlHandle.read()
93148102 2388 webpage = html.decode('utf-8')
59ae15a5
PH
2389 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2390 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2391 return
2392 if dlNewest:
2393 url = htmlHandle.geturl()
ca6849e6 2394 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2395 if mobj is None:
2396 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2397 return
2398 if mobj.group('episode') == '':
2399 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2400 return
2401 epTitle = mobj.group('episode')
2402
93148102 2403 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
59ae15a5
PH
2404
2405 if len(mMovieParams) == 0:
2406 # The Colbert Report embeds the information in a data-mgid attribute
2407 # without a URL prefix; so extract the alternate reference
2408 # and then add the URL prefix manually.
2409
93148102 2410 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
59ae15a5
PH
2411 if len(altMovieParams) == 0:
2412 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2413 return
2414 else:
2415 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
cdb30764 2416
59ae15a5
PH
2417 uri = mMovieParams[0][1]
2418 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2419 self.report_index_download(epTitle)
2420 try:
2421 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2422 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2423 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2424 return
2425
2426 results = []
2427
2428 idoc = xml.etree.ElementTree.fromstring(indexXml)
2429 itemEls = idoc.findall('.//item')
7717ae19 2430 for partNum,itemEl in enumerate(itemEls):
59ae15a5
PH
2431 mediaId = itemEl.findall('./guid')[0].text
2432 shortMediaId = mediaId.split(':')[-1]
2433 showId = mediaId.split(':')[-2].replace('.com', '')
2434 officialTitle = itemEl.findall('./title')[0].text
2435 officialDate = itemEl.findall('./pubDate')[0].text
2436
2437 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2438 compat_urllib_parse.urlencode({'uri': mediaId}))
2439 configReq = compat_urllib_request.Request(configUrl)
32635ec6 2440 self.report_config_download(epTitle, shortMediaId)
59ae15a5
PH
2441 try:
2442 configXml = compat_urllib_request.urlopen(configReq).read()
2443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2444 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2445 return
2446
2447 cdoc = xml.etree.ElementTree.fromstring(configXml)
2448 turls = []
2449 for rendition in cdoc.findall('.//rendition'):
2450 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2451 turls.append(finfo)
2452
2453 if len(turls) == 0:
2454 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2455 continue
cdb30764 2456
59ae15a5
PH
2457 if self._downloader.params.get('listformats', None):
2458 self._print_formats([i[0] for i in turls])
2459 return
2460
2461 # For now, just pick the highest bitrate
32635ec6 2462 format,rtmp_video_url = turls[-1]
59ae15a5
PH
2463
2464 # Get the format arg from the arg stream
2465 req_format = self._downloader.params.get('format', None)
2466
2467 # Select format if we can find one
2468 for f,v in turls:
2469 if f == req_format:
32635ec6 2470 format, rtmp_video_url = f, v
59ae15a5
PH
2471 break
2472
32635ec6
PH
2473 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2474 if not m:
2475 raise ExtractorError(u'Cannot transform RTMP url')
2476 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2477 video_url = base + m.group('finalid')
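# Illustrative rewrite (the media path itself is hypothetical): from an RTMP
# URL such as rtmpe://host/gsp.comedystor/com/dailyshow/clip_750.mp4 only the
# 'gsp.comedystor/...' part (the 'finalid' group) is kept and appended to the
# fixed HTTP 'base' above, yielding a plain progressive-download URL.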
59ae15a5 2478
7717ae19 2479 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
59ae15a5
PH
2480 info = {
2481 'id': shortMediaId,
2482 'url': video_url,
2483 'uploader': showId,
2484 'upload_date': officialDate,
2485 'title': effTitle,
2486 'ext': 'mp4',
2487 'format': format,
2488 'thumbnail': None,
2489 'description': officialTitle,
59ae15a5 2490 }
59ae15a5 2491 results.append(info)
cdb30764 2492
59ae15a5 2493 return results
d77c3dfd
FV
2494
2495
2496class EscapistIE(InfoExtractor):
59ae15a5
PH
2497 """Information extractor for The Escapist """
2498
2499 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2500 IE_NAME = u'escapist'
2501
2502 def report_extraction(self, showName):
2503 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2504
2505 def report_config_download(self, showName):
2506 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2507
2508 def _real_extract(self, url):
2509 mobj = re.match(self._VALID_URL, url)
2510 if mobj is None:
2511 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2512 return
2513 showName = mobj.group('showname')
2514 videoId = mobj.group('episode')
2515
2516 self.report_extraction(showName)
2517 try:
2518 webPage = compat_urllib_request.urlopen(url)
2519 webPageBytes = webPage.read()
2520 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2521 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2522 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2523 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2524 return
2525
2526 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2527 description = unescapeHTML(descMatch.group(1))
2528 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2529 imgUrl = unescapeHTML(imgMatch.group(1))
2530 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2531 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2532 configUrlMatch = re.search('config=(.*)$', playerUrl)
2533 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2534
2535 self.report_config_download(showName)
2536 try:
93702113
FV
2537 configJSON = compat_urllib_request.urlopen(configUrl)
2538 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2539 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
59ae15a5
PH
2540 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2541 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2542 return
2543
2544 # Technically, it's JavaScript, not JSON
2545 configJSON = configJSON.replace("'", '"')
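# Illustrative: a config snippet like {'playlist': [{'url': '...'}]} becomes
# {"playlist": [{"url": "..."}]} so json.loads() can parse it. This simple
# swap assumes the values themselves contain no single quotes.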
2546
2547 try:
2548 config = json.loads(configJSON)
2549 except (ValueError,) as err:
2550 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2551 return
2552
2553 playlist = config['playlist']
2554 videoUrl = playlist[1]['url']
2555
2556 info = {
2557 'id': videoId,
2558 'url': videoUrl,
2559 'uploader': showName,
2560 'upload_date': None,
2561 'title': showName,
47dcd621 2562 'ext': 'mp4',
59ae15a5
PH
2563 'thumbnail': imgUrl,
2564 'description': description,
2565 'player_url': playerUrl,
2566 }
2567
2568 return [info]
d77c3dfd 2569
d77c3dfd 2570class CollegeHumorIE(InfoExtractor):
59ae15a5
PH
2571 """Information extractor for collegehumor.com"""
2572
0eb0faa2 2573 _WORKING = False
59ae15a5
PH
2574 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2575 IE_NAME = u'collegehumor'
2576
799c0763 2577 def report_manifest(self, video_id):
59ae15a5 2578 """Report information extraction."""
799c0763 2579 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
59ae15a5
PH
2580
2581 def report_extraction(self, video_id):
2582 """Report information extraction."""
2583 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2584
2585 def _real_extract(self, url):
2586 mobj = re.match(self._VALID_URL, url)
2587 if mobj is None:
2588 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2589 return
2590 video_id = mobj.group('videoid')
2591
59ae15a5
PH
2592 info = {
2593 'id': video_id,
59ae15a5
PH
2594 'uploader': None,
2595 'upload_date': None,
2596 }
2597
2598 self.report_extraction(video_id)
799c0763 2599 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
59ae15a5
PH
2600 try:
2601 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2602 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2603 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2604 return
2605
2606 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2607 try:
2608 videoNode = mdoc.findall('./video')[0]
2609 info['description'] = videoNode.findall('./description')[0].text
2610 info['title'] = videoNode.findall('./caption')[0].text
59ae15a5 2611 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
799c0763 2612 manifest_url = videoNode.findall('./file')[0].text
59ae15a5
PH
2613 except IndexError:
2614 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2615 return
2616
799c0763
PH
2617 manifest_url += '?hdcore=2.10.3'
2618 self.report_manifest(video_id)
2619 try:
2620 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2621 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2622 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2623 return
2624
2625 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2626 try:
2627 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2628 node_id = media_node.attrib['url']
2629 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2630 except IndexError as err:
2631 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2632 return
2633
2634 url_pr = compat_urllib_parse_urlparse(manifest_url)
2635 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
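# Illustrative composition (names are hypothetical): with a manifest at
# http://cdn.example.com/path/manifest.f4m, a media url attribute of
# 'video_1200_' and an <id> of '1234AB', the result is
#   http://cdn.example.com/z1234/video_1200_Seg1-Frag1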
2636
2637 info['url'] = url
2638 info['ext'] = 'f4f'
59ae15a5 2639 return [info]
d77c3dfd
FV
2640
2641
2642class XVideosIE(InfoExtractor):
59ae15a5 2643 """Information extractor for xvideos.com"""
d77c3dfd 2644
59ae15a5
PH
2645 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2646 IE_NAME = u'xvideos'
d77c3dfd 2647
59ae15a5
PH
2648 def report_extraction(self, video_id):
2649 """Report information extraction."""
2650 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
d77c3dfd 2651
59ae15a5
PH
2652 def _real_extract(self, url):
2653 mobj = re.match(self._VALID_URL, url)
2654 if mobj is None:
2655 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2656 return
8588a86f 2657 video_id = mobj.group(1)
d77c3dfd 2658
5f955171 2659 webpage = self._download_webpage(url, video_id)
d77c3dfd 2660
59ae15a5 2661 self.report_extraction(video_id)
d77c3dfd
FV
2662
2663
59ae15a5
PH
2664 # Extract video URL
2665 mobj = re.search(r'flv_url=(.+?)&', webpage)
2666 if mobj is None:
2667 self._downloader.trouble(u'ERROR: unable to extract video url')
2668 return
8588a86f 2669 video_url = compat_urllib_parse.unquote(mobj.group(1))
d77c3dfd
FV
2670
2671
59ae15a5
PH
2672 # Extract title
2673 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2674 if mobj is None:
2675 self._downloader.trouble(u'ERROR: unable to extract video title')
2676 return
8588a86f 2677 video_title = mobj.group(1)
d77c3dfd
FV
2678
2679
59ae15a5
PH
2680 # Extract video thumbnail
2681 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2682 if mobj is None:
2683 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2684 return
8588a86f 2685 video_thumbnail = mobj.group(0)
d77c3dfd 2686
59ae15a5
PH
2687 info = {
2688 'id': video_id,
2689 'url': video_url,
2690 'uploader': None,
2691 'upload_date': None,
2692 'title': video_title,
2693 'ext': 'flv',
2694 'thumbnail': video_thumbnail,
2695 'description': None,
2696 }
d77c3dfd 2697
59ae15a5 2698 return [info]
d77c3dfd
FV
2699
2700
2701class SoundcloudIE(InfoExtractor):
59ae15a5
PH
2702 """Information extractor for soundcloud.com
2703 To access the media, the page URL is first resolved into track
2704 metadata via api.soundcloud.com/resolve.json, which yields the
2705 numeric track id, uploader and title. The stream URLs are then
2706 read from the track's /streams endpoint on api.sndcdn.com and the
2707 'http_mp3_128_url' entry is used as the media URL.
2708 """
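# Illustrative flow (artist, track and id are hypothetical):
#   page:    http://soundcloud.com/some-artist/some-track
#   resolve: http://api.soundcloud.com/resolve.json?url=<page url>&client_id=<key>
#            -> JSON with the numeric track "id", uploader, title, ...
#   streams: https://api.sndcdn.com/i1/tracks/<id>/streams?client_id=<key>
#            -> JSON whose 'http_mp3_128_url' entry is the downloadable MP3.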
2709
2710 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2711 IE_NAME = u'soundcloud'
2712
2713 def __init__(self, downloader=None):
2714 InfoExtractor.__init__(self, downloader)
2715
8fd3afd5 2716 def report_resolve(self, video_id):
59ae15a5 2717 """Report information extraction."""
8fd3afd5 2718 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
59ae15a5
PH
2719
2720 def report_extraction(self, video_id):
2721 """Report information extraction."""
8fd3afd5 2722 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
59ae15a5
PH
2723
2724 def _real_extract(self, url):
2725 mobj = re.match(self._VALID_URL, url)
2726 if mobj is None:
2727 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2728 return
2729
2730 # extract uploader (which is in the url)
15c8d833 2731 uploader = mobj.group(1)
59ae15a5 2732 # extract simple title (uploader + slug of song title)
15c8d833 2733 slug_title = mobj.group(2)
59ae15a5
PH
2734 simple_title = uploader + u'-' + slug_title
2735
8fd3afd5 2736 self.report_resolve('%s/%s' % (uploader, slug_title))
59ae15a5 2737
8fd3afd5
PH
2738 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2739 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2740 request = compat_urllib_request.Request(resolv_url)
59ae15a5 2741 try:
8fd3afd5
PH
2742 info_json_bytes = compat_urllib_request.urlopen(request).read()
2743 info_json = info_json_bytes.decode('utf-8')
59ae15a5
PH
2744 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2745 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2746 return
2747
8fd3afd5
PH
2748 info = json.loads(info_json)
2749 video_id = info['id']
59ae15a5
PH
2750 self.report_extraction('%s/%s' % (uploader, slug_title))
2751
8fd3afd5 2752 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
c7214f9a 2753 request = compat_urllib_request.Request(streams_url)
8fd3afd5
PH
2754 try:
2755 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2756 stream_json = stream_json_bytes.decode('utf-8')
2757 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5f955171 2758 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
b4cd069d 2759 return
59ae15a5 2760
8fd3afd5 2761 streams = json.loads(stream_json)
c7214f9a 2762 mediaURL = streams['http_mp3_128_url']
59ae15a5
PH
2763
2764 return [{
c7214f9a 2765 'id': info['id'],
59ae15a5 2766 'url': mediaURL,
c7214f9a
PH
2767 'uploader': info['user']['username'],
2768 'upload_date': info['created_at'],
2769 'title': info['title'],
59ae15a5 2770 'ext': u'mp3',
c7214f9a 2771 'description': info['description'],
59ae15a5 2772 }]
d77c3dfd
FV
2773
2774
2775class InfoQIE(InfoExtractor):
59ae15a5 2776 """Information extractor for infoq.com"""
59ae15a5 2777 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
59ae15a5 2778
59ae15a5
PH
2779 def report_extraction(self, video_id):
2780 """Report information extraction."""
2781 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2782
2783 def _real_extract(self, url):
2784 mobj = re.match(self._VALID_URL, url)
2785 if mobj is None:
2786 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2787 return
2788
4fcca4bb 2789 webpage = self._download_webpage(url, video_id=url)
59ae15a5
PH
2790 self.report_extraction(url)
2791
59ae15a5
PH
2792 # Extract video URL
2793 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2794 if mobj is None:
2795 self._downloader.trouble(u'ERROR: unable to extract video url')
2796 return
4fcca4bb
PH
2797 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2798 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
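# Illustrative (the encoded value is made up): a page attribute such as
#   jsclassref='cHJlc2VudGF0aW9ucy9zb21lLXRhbGsuZmx2'
# base64-decodes to 'presentations/some-talk.flv', giving
#   rtmpe://video.infoq.com/cfx/st/presentations/some-talk.flv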
59ae15a5
PH
2799
2800 # Extract title
2801 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2802 if mobj is None:
2803 self._downloader.trouble(u'ERROR: unable to extract video title')
2804 return
4fcca4bb 2805 video_title = mobj.group(1)
59ae15a5
PH
2806
2807 # Extract description
2808 video_description = u'No description available.'
2809 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2810 if mobj is not None:
4fcca4bb 2811 video_description = mobj.group(1)
59ae15a5
PH
2812
2813 video_filename = video_url.split('/')[-1]
2814 video_id, extension = video_filename.split('.')
2815
2816 info = {
2817 'id': video_id,
2818 'url': video_url,
2819 'uploader': None,
2820 'upload_date': None,
2821 'title': video_title,
2822 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2823 'thumbnail': None,
2824 'description': video_description,
2825 }
2826
2827 return [info]
d77c3dfd
FV
2828
2829class MixcloudIE(InfoExtractor):
59ae15a5 2830 """Information extractor for www.mixcloud.com"""
93702113
FV
2831
2832 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
59ae15a5
PH
2833 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2834 IE_NAME = u'mixcloud'
2835
2836 def __init__(self, downloader=None):
2837 InfoExtractor.__init__(self, downloader)
2838
2839 def report_download_json(self, file_id):
2840 """Report JSON download."""
2841 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2842
2843 def report_extraction(self, file_id):
2844 """Report information extraction."""
2845 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2846
2847 def get_urls(self, jsonData, fmt, bitrate='best'):
2848 """Get urls from 'audio_formats' section in json"""
2849 file_url = None
2850 try:
2851 bitrate_list = jsonData[fmt]
2852 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2853 bitrate = max(bitrate_list) # select highest
2854
2855 url_list = jsonData[fmt][bitrate]
2856 except TypeError: # we have no bitrate info.
2857 url_list = jsonData[fmt]
2858 return url_list
2859
2860 def check_urls(self, url_list):
2861 """Returns 1st active url from list"""
2862 for url in url_list:
2863 try:
2864 compat_urllib_request.urlopen(url)
2865 return url
2866 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2867 url = None
2868
2869 return None
2870
2871 def _print_formats(self, formats):
2872 print('Available formats:')
2873 for fmt in formats.keys():
2874 for b in formats[fmt]:
2875 try:
2876 ext = formats[fmt][b][0]
2877 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2878 except TypeError: # we have no bitrate info
2879 ext = formats[fmt][0]
2880 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2881 break
2882
2883 def _real_extract(self, url):
2884 mobj = re.match(self._VALID_URL, url)
2885 if mobj is None:
2886 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2887 return
2888 # extract uploader & filename from url
2889 uploader = mobj.group(1)
2890 file_id = uploader + "-" + mobj.group(2)
2891
2892 # construct API request
2893 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
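# Illustrative only (URL hypothetical; note the trailing slash):
# >>> url = 'http://www.mixcloud.com/some-dj/some-mix/'
# >>> '/'.join(url.split('/')[-3:-1])
# 'some-dj/some-mix'
# which yields 'http://www.mixcloud.com/api/1/cloudcast/some-dj/some-mix.json'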
2894 # retrieve .json file with links to files
2895 request = compat_urllib_request.Request(file_url)
2896 try:
2897 self.report_download_json(file_url)
2898 jsonData = compat_urllib_request.urlopen(request).read()
2899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2900 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2901 return
2902
2903 # parse JSON
2904 json_data = json.loads(jsonData)
2905 player_url = json_data['player_swf_url']
2906 formats = dict(json_data['audio_formats'])
2907
2908 req_format = self._downloader.params.get('format', None)
2909 bitrate = None
2910
2911 if self._downloader.params.get('listformats', None):
2912 self._print_formats(formats)
2913 return
2914
2915 if req_format is None or req_format == 'best':
2916 for format_param in formats.keys():
2917 url_list = self.get_urls(formats, format_param)
2918 # check urls
2919 file_url = self.check_urls(url_list)
2920 if file_url is not None:
2921 break # got it!
2922 else:
99b0a129 2923 if req_format not in formats:
59ae15a5
PH
2924 self._downloader.trouble(u'ERROR: format is not available')
2925 return
2926
2927 url_list = self.get_urls(formats, req_format)
2928 file_url = self.check_urls(url_list)
2929 format_param = req_format
2930
2931 return [{
2932 'id': file_id,
2933 'url': file_url,
2934 'uploader': uploader,
2935 'upload_date': None,
2936 'title': json_data['name'],
2937 'ext': file_url.split('.')[-1],
2938 'format': (format_param if format_param else u'NA'),
2939 'thumbnail': json_data['thumbnail_url'],
2940 'description': json_data['description'],
2941 'player_url': player_url,
2942 }]
d77c3dfd
FV
2943
2944class StanfordOpenClassroomIE(InfoExtractor):
59ae15a5
PH
2945 """Information extractor for Stanford's Open ClassRoom"""
2946
2947 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2948 IE_NAME = u'stanfordoc'
2949
2950 def report_download_webpage(self, objid):
2951 """Report information extraction."""
2952 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2953
2954 def report_extraction(self, video_id):
2955 """Report information extraction."""
2956 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2957
2958 def _real_extract(self, url):
2959 mobj = re.match(self._VALID_URL, url)
2960 if mobj is None:
f0bad2b0 2961 raise ExtractorError(u'Invalid URL: %s' % url)
59ae15a5
PH
2962
2963 if mobj.group('course') and mobj.group('video'): # A specific video
2964 course = mobj.group('course')
2965 video = mobj.group('video')
2966 info = {
2967 'id': course + '_' + video,
2968 'uploader': None,
2969 'upload_date': None,
2970 }
2971
2972 self.report_extraction(info['id'])
2973 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2974 xmlUrl = baseUrl + video + '.xml'
2975 try:
2976 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2977 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2978 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2979 return
2980 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2981 try:
2982 info['title'] = mdoc.findall('./title')[0].text
2983 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2984 except IndexError:
2985 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2986 return
2987 info['ext'] = info['url'].rpartition('.')[2]
2988 return [info]
2989 elif mobj.group('course'): # A course page
2990 course = mobj.group('course')
2991 info = {
2992 'id': course,
2993 'type': 'playlist',
2994 'uploader': None,
2995 'upload_date': None,
2996 }
2997
f0bad2b0
PH
2998 coursepage = self._download_webpage(url, info['id'],
2999 note='Downloading course info page',
3000 errnote='Unable to download course info page')
59ae15a5
PH
3001
3002 m = re.search('<h1>([^<]+)</h1>', coursepage)
3003 if m:
3004 info['title'] = unescapeHTML(m.group(1))
3005 else:
3006 info['title'] = info['id']
3007
3008 m = re.search('<description>([^<]+)</description>', coursepage)
3009 if m:
3010 info['description'] = unescapeHTML(m.group(1))
3011
3012 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3013 info['list'] = [
3014 {
3015 'type': 'reference',
3016 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3017 }
3018 for vpage in links]
3019 results = []
3020 for entry in info['list']:
3021 assert entry['type'] == 'reference'
3022 results += self.extract(entry['url'])
3023 return results
59ae15a5
PH
3024 else: # Root page
3025 info = {
3026 'id': 'Stanford OpenClassroom',
3027 'type': 'playlist',
3028 'uploader': None,
3029 'upload_date': None,
3030 }
3031
3032 self.report_download_webpage(info['id'])
3033 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3034 try:
3035 rootpage = compat_urllib_request.urlopen(rootURL).read()
3036 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3037 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3038 return
3039
3040 info['title'] = info['id']
3041
3042 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3043 info['list'] = [
3044 {
3045 'type': 'reference',
3046 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3047 }
3048 for cpage in links]
3049
3050 results = []
3051 for entry in info['list']:
3052 assert entry['type'] == 'reference'
3053 results += self.extract(entry['url'])
3054 return results
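# Illustrative flow (URLs hypothetical): the root HomePage.php is expanded into
# 'reference' entries such as
#   {'type': 'reference',
#    'url': 'http://openclassroom.stanford.edu/MainFolder/CoursePage.php?course=MachineLearning'}
# and self.extract() recurses on each, so a course page in turn expands its
# VideoPage.php links until concrete video info dicts are returned.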
d77c3dfd
FV
3055
3056class MTVIE(InfoExtractor):
59ae15a5
PH
3057 """Information extractor for MTV.com"""
3058
3059 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3060 IE_NAME = u'mtv'
3061
59ae15a5
PH
3062 def report_extraction(self, video_id):
3063 """Report information extraction."""
3064 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3065
3066 def _real_extract(self, url):
3067 mobj = re.match(self._VALID_URL, url)
3068 if mobj is None:
3069 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3070 return
3071 if not mobj.group('proto'):
3072 url = 'http://' + url
3073 video_id = mobj.group('videoid')
59ae15a5 3074
5f955171 3075 webpage = self._download_webpage(url, video_id)
59ae15a5
PH
3076
3077 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3078 if mobj is None:
3079 self._downloader.trouble(u'ERROR: unable to extract song name')
3080 return
3081 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3082 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3083 if mobj is None:
3084 self._downloader.trouble(u'ERROR: unable to extract performer')
3085 return
3086 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
cdb30764 3087 video_title = performer + ' - ' + song_name
59ae15a5
PH
3088
3089 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3090 if mobj is None:
3091 self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
3092 return
3093 mtvn_uri = mobj.group(1)
3094
3095 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3096 if mobj is None:
3097 self._downloader.trouble(u'ERROR: unable to extract content id')
3098 return
3099 content_id = mobj.group(1)
3100
3101 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3102 self.report_extraction(video_id)
3103 request = compat_urllib_request.Request(videogen_url)
3104 try:
3105 metadataXml = compat_urllib_request.urlopen(request).read()
3106 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3107 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3108 return
3109
3110 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3111 renditions = mdoc.findall('.//rendition')
3112
3113 # For now, always pick the highest quality.
3114 rendition = renditions[-1]
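# Assumed layout of the mediaGen XML parsed here (illustrative only):
#   <package><video><item>
#     <rendition width="320" height="240" bitrate="360" type="video/mp4">
#       <src>rtmpe://.../video_360.mp4</src>
#     </rendition>
#     <rendition width="640" height="480" bitrate="800" type="video/mp4">
#       <src>rtmpe://.../video_800.mp4</src>
#     </rendition>
#   </item></video></package>
# renditions[-1] therefore selects the last listed (assumed highest-quality) entry.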
3115
3116 try:
3117 _,_,ext = rendition.attrib['type'].partition('/')
3118 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3119 video_url = rendition.find('./src').text
3120 except KeyError:
3121 self._downloader.trouble(u'ERROR: Invalid rendition field.')
3122 return
3123
3124 info = {
3125 'id': video_id,
3126 'url': video_url,
3127 'uploader': performer,
3128 'upload_date': None,
3129 'title': video_title,
3130 'ext': ext,
3131 'format': format,
3132 }
3133
3134 return [info]
6de7ef9b 3135
302efc19 3136
302efc19 3137class YoukuIE(InfoExtractor):
59ae15a5 3138 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
59ae15a5
PH
3139
3140 def report_download_webpage(self, file_id):
3141 """Report webpage download."""
a34dd63b 3142 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
59ae15a5
PH
3143
3144 def report_extraction(self, file_id):
3145 """Report information extraction."""
a34dd63b 3146 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
59ae15a5
PH
3147
3148 def _gen_sid(self):
3149 nowTime = int(time.time() * 1000)
3150 random1 = random.randint(1000,1998)
3151 random2 = random.randint(1000,9999)
3152
3153 return "%d%d%d" %(nowTime,random1,random2)
3154
3155 def _get_file_ID_mix_string(self, seed):
3156 mixed = []
3157 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3158 seed = float(seed)
3159 for i in range(len(source)):
3160 seed = (seed * 211 + 30031 ) % 65536
3161 index = math.floor(seed / 65536 * len(source) )
3162 mixed.append(source[int(index)])
3163 source.remove(source[int(index)])
3164 #return ''.join(mixed)
3165 return mixed
3166
3167 def _get_file_id(self, fileId, seed):
3168 mixed = self._get_file_ID_mix_string(seed)
3169 ids = fileId.split('*')
3170 realId = []
3171 for ch in ids:
3172 if ch:
3173 realId.append(mixed[int(ch)])
3174 return ''.join(realId)
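# Illustrative hand trace of the shuffle in _get_file_ID_mix_string (the seed
# value is hypothetical; only the arithmetic comes from the code above):
#   seed = 42
#   seed  = (42 * 211 + 30031) % 65536   -> 38893
#   index = floor(38893 / 65536 * 68)    -> 40, i.e. source[40] == 'O'
# 'O' is appended to `mixed` and removed from `source`, and the loop repeats
# until the whole 68-character alphabet is permuted.  _get_file_id then maps
# each '*'-separated number in fileId through that permutation.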
3175
3176 def _real_extract(self, url):
3177 mobj = re.match(self._VALID_URL, url)
3178 if mobj is None:
3179 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3180 return
3181 video_id = mobj.group('ID')
3182
3183 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3184
3185 request = compat_urllib_request.Request(info_url, None, std_headers)
3186 try:
3187 self.report_download_webpage(video_id)
3188 jsondata = compat_urllib_request.urlopen(request).read()
3189 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3190 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3191 return
3192
3193 self.report_extraction(video_id)
3194 try:
8f6f40d9
PH
3195 jsonstr = jsondata.decode('utf-8')
3196 config = json.loads(jsonstr)
59ae15a5
PH
3197
3198 video_title = config['data'][0]['title']
3199 seed = config['data'][0]['seed']
3200
3201 format = self._downloader.params.get('format', None)
1a2c3c0f 3202 supported_format = list(config['data'][0]['streamfileids'].keys())
59ae15a5
PH
3203
3204 if format is None or format == 'best':
3205 if 'hd2' in supported_format:
3206 format = 'hd2'
3207 else:
3208 format = 'flv'
3209 ext = u'flv'
3210 elif format == 'worst':
3211 format = 'mp4'
3212 ext = u'mp4'
3213 else:
3214 format = 'flv'
3215 ext = u'flv'
3216
3217
3218 fileid = config['data'][0]['streamfileids'][format]
e2a8ff24 3219 keys = [s['k'] for s in config['data'][0]['segs'][format]]
8f6f40d9 3220 except (UnicodeDecodeError, ValueError, KeyError):
59ae15a5
PH
3221 self._downloader.trouble(u'ERROR: unable to extract info section')
3222 return
3223
3224 files_info=[]
3225 sid = self._gen_sid()
3226 fileid = self._get_file_id(fileid, seed)
3227
3228 # characters 9 and 10 of fileid (i.e. fileid[8:10]) encode the segment number
3229 # and are replaced with the hex segment index below
3230 for index, key in enumerate(keys):
3231
3232 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3233 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3234
3235 info = {
3236 'id': '%s_part%02d' % (video_id, index),
3237 'url': download_url,
3238 'uploader': None,
3239 'upload_date': None,
3240 'title': video_title,
3241 'ext': ext,
3242 }
3243 files_info.append(info)
3244
3245 return files_info
5dc846fa
FV
3246
3247
6de7ef9b 3248class XNXXIE(InfoExtractor):
59ae15a5
PH
3249 """Information extractor for xnxx.com"""
3250
caec7618 3251 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
59ae15a5
PH
3252 IE_NAME = u'xnxx'
3253 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3254 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3255 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3256
3257 def report_webpage(self, video_id):
3258 """Report information extraction"""
3259 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3260
3261 def report_extraction(self, video_id):
3262 """Report information extraction"""
3263 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3264
3265 def _real_extract(self, url):
3266 mobj = re.match(self._VALID_URL, url)
3267 if mobj is None:
3268 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3269 return
bec102a8 3270 video_id = mobj.group(1)
59ae15a5
PH
3271
3272 self.report_webpage(video_id)
3273
3274 # Get webpage content
3275 try:
bec102a8
PH
3276 webpage_bytes = compat_urllib_request.urlopen(url).read()
3277 webpage = webpage_bytes.decode('utf-8')
59ae15a5
PH
3278 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3279 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3280 return
3281
3282 result = re.search(self.VIDEO_URL_RE, webpage)
3283 if result is None:
3284 self._downloader.trouble(u'ERROR: unable to extract video url')
3285 return
bec102a8 3286 video_url = compat_urllib_parse.unquote(result.group(1))
59ae15a5
PH
3287
3288 result = re.search(self.VIDEO_TITLE_RE, webpage)
3289 if result is None:
3290 self._downloader.trouble(u'ERROR: unable to extract video title')
3291 return
bec102a8 3292 video_title = result.group(1)
59ae15a5
PH
3293
3294 result = re.search(self.VIDEO_THUMB_RE, webpage)
3295 if result is None:
3296 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3297 return
bec102a8 3298 video_thumbnail = result.group(1)
59ae15a5
PH
3299
3300 return [{
3301 'id': video_id,
3302 'url': video_url,
3303 'uploader': None,
3304 'upload_date': None,
3305 'title': video_title,
3306 'ext': 'flv',
3307 'thumbnail': video_thumbnail,
3308 'description': None,
3309 }]
fd873c69
FV
3310
3311
d443aca8 3312class GooglePlusIE(InfoExtractor):
59ae15a5
PH
3313 """Information extractor for plus.google.com."""
3314
93702113 3315 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
59ae15a5
PH
3316 IE_NAME = u'plus.google'
3317
3318 def __init__(self, downloader=None):
3319 InfoExtractor.__init__(self, downloader)
3320
3321 def report_extract_entry(self, url):
3322 """Report downloading extry"""
93702113 3323 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
59ae15a5
PH
3324
3325 def report_date(self, upload_date):
3326 """Report downloading extry"""
3327 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3328
3329 def report_uploader(self, uploader):
3330 """Report downloading extry"""
93702113 3331 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
59ae15a5
PH
3332
3333 def report_title(self, video_title):
3334 """Report downloading extry"""
93702113 3335 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
59ae15a5
PH
3336
3337 def report_extract_vid_page(self, video_page):
3338 """Report information extraction."""
93702113 3339 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
59ae15a5
PH
3340
3341 def _real_extract(self, url):
3342 # Extract id from URL
3343 mobj = re.match(self._VALID_URL, url)
3344 if mobj is None:
3345 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3346 return
3347
3348 post_url = mobj.group(0)
93702113 3349 video_id = mobj.group(1)
59ae15a5
PH
3350
3351 video_extension = 'flv'
3352
3353 # Step 1, Retrieve post webpage to extract further information
3354 self.report_extract_entry(post_url)
3355 request = compat_urllib_request.Request(post_url)
3356 try:
93702113 3357 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
3358 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3359 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3360 return
3361
3362 # Extract update date
3363 upload_date = None
3364 pattern = 'title="Timestamp">(.*?)</a>'
3365 mobj = re.search(pattern, webpage)
3366 if mobj:
3367 upload_date = mobj.group(1)
3368 # Convert timestring to a format suitable for filename
3369 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3370 upload_date = upload_date.strftime('%Y%m%d')
3371 self.report_date(upload_date)
3372
3373 # Extract uploader
3374 uploader = None
3375 pattern = r'rel\="author".*?>(.*?)</a>'
3376 mobj = re.search(pattern, webpage)
3377 if mobj:
3378 uploader = mobj.group(1)
3379 self.report_uploader(uploader)
3380
3381 # Extract title
3382 # Get the first line for title
3383 video_title = u'NA'
3384 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3385 mobj = re.search(pattern, webpage)
3386 if mobj:
3387 video_title = mobj.group(1)
3388 self.report_title(video_title)
3389
3390 # Step 2, Stimulate clicking the image box to launch video
3391 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3392 mobj = re.search(pattern, webpage)
3393 if mobj is None:
3394 raise ExtractorError(u'Unable to extract video page URL')
3395
3396 video_page = mobj.group(1)
3397 request = compat_urllib_request.Request(video_page)
3398 try:
93702113 3399 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
3400 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3401 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3402 return
3403 self.report_extract_vid_page(video_page)
3404
3405
3406 # Extract video links of all sizes from the video page
3408 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3409 mobj = re.findall(pattern, webpage)
3410 if len(mobj) == 0:
3411 raise ExtractorError(u'Unable to extract video links')
3412
3413 # Sort by resolution (the regex captures (resolution, url) tuples)
3414 links = sorted(mobj)
3415
3416 # Pick the last entry of the ascending sort, i.e. the highest resolution
3417 video_url = links[-1]
3418 # Keep only the URL; the resolution part of the tuple is no longer needed
3419 video_url = video_url[-1]
3420 # Treat escaped \u0026 style hex
93702113
FV
3421 try:
3422 video_url = video_url.decode("unicode_escape")
3423 except AttributeError: # Python 3
3424 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
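# Illustrative only (URL hypothetical): an escaped link such as
#   'http://redirector.googlevideo.com/videoplayback?id=1\u0026itag=22'
# comes out of the decode above as
#   'http://redirector.googlevideo.com/videoplayback?id=1&itag=22'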
59ae15a5
PH
3425
3426
3427 return [{
93702113 3428 'id': video_id,
59ae15a5 3429 'url': video_url,
93702113
FV
3430 'uploader': uploader,
3431 'upload_date': upload_date,
3432 'title': video_title,
3433 'ext': video_extension,
59ae15a5 3434 }]
4cc3d074
PH
3435
3436class NBAIE(InfoExtractor):
3437 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3438 IE_NAME = u'nba'
3439
4cc3d074
PH
3440 def _real_extract(self, url):
3441 mobj = re.match(self._VALID_URL, url)
3442 if mobj is None:
3443 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3444 return
3445
3446 video_id = mobj.group(1)
3447 if video_id.endswith('/index.html'):
3448 video_id = video_id[:-len('/index.html')]
3449
5f955171 3450 webpage = self._download_webpage(url, video_id)
4cc3d074
PH
3451
3452 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3453 def _findProp(rexp, default=None):
3454 m = re.search(rexp, webpage)
3455 if m:
3456 return unescapeHTML(m.group(1))
3457 else:
3458 return default
3459
3460 shortened_video_id = video_id.rpartition('/')[2]
3461 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3462 info = {
3463 'id': shortened_video_id,
3464 'url': video_url,
3465 'ext': 'mp4',
3466 'title': title,
3467 'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3468 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3469 }
3470 return [info]
0b40544f
DV
3471
3472class JustinTVIE(InfoExtractor):
3473 """Information extractor for justin.tv and twitch.tv"""
2ab1c5ed
DV
3474 # TODO: One broadcast may be split into multiple videos. The key
3475 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3476 # starts at 1 and increases. Can we treat all parts as one video?
3477
4096b609
DV
3478 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3479 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3480 _JUSTIN_PAGE_LIMIT = 100
0b40544f
DV
3481 IE_NAME = u'justin.tv'
3482
3483 def report_extraction(self, file_id):
3484 """Report information extraction."""
3485 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3486
4096b609
DV
3487 def report_download_page(self, channel, offset):
3488 """Report attempt to download a single page of videos."""
3489 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3490 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3491
2ab1c5ed
DV
3492 # Return count of items, list of *valid* items
3493 def _parse_page(self, url):
0b40544f 3494 try:
2ab1c5ed 3495 urlh = compat_urllib_request.urlopen(url)
0b40544f
DV
3496 webpage_bytes = urlh.read()
3497 webpage = webpage_bytes.decode('utf-8', 'ignore')
3498 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3499 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3500 return
cdb30764 3501
0b40544f 3502 response = json.loads(webpage)
fa1bf9c6 3503 if type(response) != list:
3504 error_text = response.get('error', 'unknown error')
3505 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3506 return
0b40544f
DV
3507 info = []
3508 for clip in response:
3509 video_url = clip['video_file_url']
3510 if video_url:
3511 video_extension = os.path.splitext(video_url)[1][1:]
fa1bf9c6 3512 video_date = re.sub('-', '', clip['start_time'][:10])
3513 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
97f194c1
PH
3514 video_id = clip['id']
3515 video_title = clip.get('title', video_id)
0b40544f 3516 info.append({
97f194c1 3517 'id': video_id,
0b40544f 3518 'url': video_url,
97f194c1 3519 'title': video_title,
fa1bf9c6 3520 'uploader': clip.get('channel_name', video_uploader_id),
3521 'uploader_id': video_uploader_id,
0b40544f
DV
3522 'upload_date': video_date,
3523 'ext': video_extension,
3524 })
2ab1c5ed
DV
3525 return (len(response), info)
3526
3527 def _real_extract(self, url):
3528 mobj = re.match(self._VALID_URL, url)
3529 if mobj is None:
3530 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3531 return
cdb30764 3532
2ab1c5ed
DV
3533 api = 'http://api.justin.tv'
3534 video_id = mobj.group(mobj.lastindex)
3535 paged = False
3536 if mobj.lastindex == 1:
3537 paged = True
3538 api += '/channel/archives/%s.json'
3539 else:
fa1bf9c6 3540 api += '/broadcast/by_archive/%s.json'
2ab1c5ed 3541 api = api % (video_id,)
cdb30764 3542
2ab1c5ed 3543 self.report_extraction(video_id)
cdb30764 3544
2ab1c5ed
DV
3545 info = []
3546 offset = 0
4096b609
DV
3547 limit = self._JUSTIN_PAGE_LIMIT
3548 while True:
3549 if paged:
3550 self.report_download_page(video_id, offset)
2ab1c5ed
DV
3551 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3552 page_count, page_info = self._parse_page(page_url)
3553 info.extend(page_info)
3554 if not paged or page_count != limit:
3555 break
3556 offset += limit
0b40544f 3557 return info
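# Illustrative request sequence for a channel archive (channel name
# hypothetical), following the URL scheme built above:
#   http://api.justin.tv/channel/archives/somechannel.json?offset=0&limit=100
#   http://api.justin.tv/channel/archives/somechannel.json?offset=100&limit=100
#   ...
# and so on until a page returns fewer than _JUSTIN_PAGE_LIMIT clips.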
21a9c6aa
PH
3558
3559class FunnyOrDieIE(InfoExtractor):
3560 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
21a9c6aa 3561
21a9c6aa
PH
3562 def _real_extract(self, url):
3563 mobj = re.match(self._VALID_URL, url)
3564 if mobj is None:
3565 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3566 return
3567
3568 video_id = mobj.group('id')
5f955171 3569 webpage = self._download_webpage(url, video_id)
21a9c6aa
PH
3570
3571 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3572 if not m:
3573 raise ExtractorError(u'Unable to find video information')
3574 video_url = unescapeHTML(m.group('url'))
21a9c6aa
PH
3575
3576 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3577 if not m:
3578 raise ExtractorError(u'Cannot find video title')
3579 title = unescapeHTML(m.group('title'))
3580
3581 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3582 if m:
3583 desc = unescapeHTML(m.group('desc'))
3584 else:
3585 desc = None
3586
3587 info = {
3588 'id': video_id,
3589 'url': video_url,
3590 'ext': 'mp4',
3591 'title': title,
3592 'description': desc,
3593 }
3594 return [info]
d0d4f277 3595
e314ba67 3596class SteamIE(InfoExtractor):
6324fd1d 3597 _VALID_URL = r"""http://store.steampowered.com/
e314ba67
JMF
3598 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3599 (?P<gameID>\d+)/?
3600 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3601 """
4aeae91f 3602
89de9eb1
FV
3603 @classmethod
3604 def suitable(cls, url):
e314ba67 3605 """Receives a URL and returns True if suitable for this IE."""
89de9eb1 3606 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
5f955171 3607
e314ba67
JMF
3608 def _real_extract(self, url):
3609 m = re.match(self._VALID_URL, url, re.VERBOSE)
3610 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3611 gameID = m.group('gameID')
3612 videourl = 'http://store.steampowered.com/video/%s/' % gameID
5f955171 3613 webpage = self._download_webpage(videourl, gameID)
e314ba67 3614 mweb = re.finditer(urlRE, webpage)
5e9d042d
JMF
3615 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3616 titles = re.finditer(namesRE, webpage)
60bd48b1
JMF
3617 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3618 thumbs = re.finditer(thumbsRE, webpage)
e314ba67 3619 videos = []
60bd48b1 3620 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
e314ba67 3621 video_id = vid.group('videoID')
5f955171
PH
3622 title = vtitle.group('videoName')
3623 video_url = vid.group('videoURL')
60bd48b1 3624 video_thumb = thumb.group('thumbnail')
e314ba67
JMF
3625 if not video_url:
3626 raise ExtractorError(u'Cannot find video url for %s' % video_id)
e314ba67
JMF
3627 info = {
3628 'id':video_id,
3629 'url':video_url,
3630 'ext': 'flv',
60bd48b1
JMF
3631 'title': unescapeHTML(title),
3632 'thumbnail': video_thumb
e314ba67
JMF
3633 }
3634 videos.append(info)
3635 return videos
ef0c8d5f 3636
278986ea 3637class UstreamIE(InfoExtractor):
ef0c8d5f 3638 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
278986ea 3639 IE_NAME = u'ustream'
ef0c8d5f 3640
278986ea
JMF
3641 def _real_extract(self, url):
3642 m = re.match(self._VALID_URL, url)
3643 video_id = m.group('videoID')
3644 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
d830b7c2 3645 webpage = self._download_webpage(url, video_id)
278986ea
JMF
3646 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3647 title = m.group('title')
3648 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3649 uploader = m.group('uploader')
3650 info = {
3651 'id':video_id,
3652 'url':video_url,
3653 'ext': 'flv',
3654 'title': title,
3655 'uploader': uploader
3656 }
3657 return [info]
4aeae91f 3658
ca0a0bbe
PH
3659class RBMARadioIE(InfoExtractor):
3660 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3661
3662 def _real_extract(self, url):
3663 m = re.match(self._VALID_URL, url)
3664 video_id = m.group('videoID')
3665
3666 webpage = self._download_webpage(url, video_id)
3667 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3668 if not m:
3669 raise ExtractorError(u'Cannot find metadata')
3670 json_data = m.group(1)
3671
3672 try:
3673 data = json.loads(json_data)
3674 except ValueError as e:
3675 raise ExtractorError(u'Invalid JSON: ' + str(e))
3676
3677 video_url = data['akamai_url'] + '&cbr=256'
3678 url_parts = compat_urllib_parse_urlparse(video_url)
3679 video_ext = url_parts.path.rpartition('.')[2]
3680 info = {
3681 'id': video_id,
3682 'url': video_url,
3683 'ext': video_ext,
3684 'title': data['title'],
3685 'description': data.get('teaser_text'),
3686 'location': data.get('country_of_origin'),
3687 'uploader': data.get('host', {}).get('name'),
3688 'uploader_id': data.get('host', {}).get('slug'),
187f491a 3689 'thumbnail': data.get('image', {}).get('large_url_2x'),
ca0a0bbe
PH
3690 'duration': data.get('duration'),
3691 }
3692 return [info]
4aeae91f 3693
991ba7fa
JC
3694
3695class YouPornIE(InfoExtractor):
3696 """Information extractor for youporn.com."""
991ba7fa 3697 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
6324fd1d 3698
991ba7fa
JC
3699 def _print_formats(self, formats):
3700 """Print all available formats"""
565f7519 3701 print(u'Available formats:')
ca6710ee
JC
3702 print(u'ext\t\tformat')
3703 print(u'---------------------------------')
991ba7fa 3704 for format in formats:
ca6710ee 3705 print(u'%s\t\t%s' % (format['ext'], format['format']))
991ba7fa
JC
3706
3707 def _specific(self, req_format, formats):
3708 for x in formats:
3709 if x['format'] == req_format:
3710 return x
3711 return None
3712
991ba7fa
JC
3713 def _real_extract(self, url):
3714 mobj = re.match(self._VALID_URL, url)
3715 if mobj is None:
3716 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3717 return
3718
ca6710ee 3719 video_id = mobj.group('videoid')
991ba7fa 3720
629fcdd1
PH
3721 req = compat_urllib_request.Request(url)
3722 req.add_header('Cookie', 'age_verified=1')
3723 webpage = self._download_webpage(req, video_id)
991ba7fa
JC
3724
3725 # Get the video title
e711babb 3726 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
991ba7fa 3727 if result is None:
e711babb 3728 raise ExtractorError(u'Unable to extract video title')
ca6710ee 3729 video_title = result.group('title').strip()
991ba7fa
JC
3730
3731 # Get the video date
e711babb 3732 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
991ba7fa 3733 if result is None:
2e5457be 3734 self._downloader.report_warning(u'unable to extract video date')
629fcdd1
PH
3735 upload_date = None
3736 else:
3737 upload_date = result.group('date').strip()
991ba7fa
JC
3738
3739 # Get the video uploader
e711babb 3740 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
991ba7fa 3741 if result is None:
2e5457be 3742 self._downloader.report_warning(u'unable to extract uploader')
629fcdd1
PH
3743 video_uploader = None
3744 else:
3745 video_uploader = result.group('uploader').strip()
3746 video_uploader = clean_html( video_uploader )
991ba7fa
JC
3747
3748 # Get all of the formats available
ca6710ee
JC
3749 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3750 result = re.search(DOWNLOAD_LIST_RE, webpage)
991ba7fa 3751 if result is None:
629fcdd1 3752 raise ExtractorError(u'Unable to extract download list')
ca6710ee 3753 download_list_html = result.group('download_list').strip()
991ba7fa
JC
3754
3755 # Get all of the links from the page
ca6710ee
JC
3756 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3757 links = re.findall(LINK_RE, download_list_html)
991ba7fa 3758 if len(links) == 0:
629fcdd1 3759 raise ExtractorError(u'No known formats available for video')
6324fd1d
FV
3760
3761 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
991ba7fa
JC
3762
3763 formats = []
3764 for link in links:
3765
3766 # A link looks like this:
3767 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3768 # A path looks like this:
3769 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
ca6710ee
JC
3770 video_url = unescapeHTML( link )
3771 path = compat_urllib_parse_urlparse( video_url ).path
991ba7fa
JC
3772 extension = os.path.splitext( path )[1][1:]
3773 format = path.split('/')[4].split('_')[:2]
3774 size = format[0]
3775 bitrate = format[1]
3776 format = "-".join( format )
3777 title = u'%s-%s-%s' % (video_title, size, bitrate)
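# Worked example using the sample path given in the comment above:
# >>> path = '/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4'
# >>> path.split('/')[4].split('_')[:2]
# ['480p', '370k']
# so size == '480p', bitrate == '370k' and the joined format string is '480p-370k'.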
3778
3779 formats.append({
3780 'id': video_id,
3781 'url': video_url,
3782 'uploader': video_uploader,
3783 'upload_date': upload_date,
3784 'title': title,
3785 'ext': extension,
3786 'format': format,
3787 'thumbnail': None,
3788 'description': None,
3789 'player_url': None
3790 })
3791
3792 if self._downloader.params.get('listformats', None):
3793 self._print_formats(formats)
3794 return
3795
3796 req_format = self._downloader.params.get('format', None)
991ba7fa
JC
3797 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3798
991ba7fa
JC
3799 if req_format is None or req_format == 'best':
3800 return [formats[0]]
3801 elif req_format == 'worst':
3802 return [formats[-1]]
3803 elif req_format in ('-1', 'all'):
3804 return formats
3805 else:
3806 format = self._specific( req_format, formats )
3807 if format is None:
3808 self._downloader.trouble(u'ERROR: requested format not available')
3809 return
3810 return [format]
3811
6324fd1d 3812
991ba7fa
JC
3813
3814class PornotubeIE(InfoExtractor):
3815 """Information extractor for pornotube.com."""
991ba7fa 3816 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
991ba7fa 3817
991ba7fa
JC
3818 def _real_extract(self, url):
3819 mobj = re.match(self._VALID_URL, url)
3820 if mobj is None:
3821 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3822 return
3823
ca6710ee
JC
3824 video_id = mobj.group('videoid')
3825 video_title = mobj.group('title')
991ba7fa
JC
3826
3827 # Get webpage content
ca6710ee 3828 webpage = self._download_webpage(url, video_id)
991ba7fa
JC
3829
3830 # Get the video URL
ca6710ee
JC
3831 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3832 result = re.search(VIDEO_URL_RE, webpage)
991ba7fa
JC
3833 if result is None:
3834 self._downloader.trouble(u'ERROR: unable to extract video url')
3835 return
ca6710ee 3836 video_url = compat_urllib_parse.unquote(result.group('url'))
991ba7fa
JC
3837
3838 #Get the uploaded date
ca6710ee
JC
3839 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3840 result = re.search(VIDEO_UPLOADED_RE, webpage)
991ba7fa
JC
3841 if result is None:
3842 self._downloader.trouble(u'ERROR: unable to extract video title')
3843 return
ca6710ee 3844 upload_date = result.group('date')
991ba7fa
JC
3845
3846 info = {'id': video_id,
3847 'url': video_url,
3848 'uploader': None,
3849 'upload_date': upload_date,
3850 'title': video_title,
3851 'ext': 'flv',
565f7519 3852 'format': 'flv'}
991ba7fa
JC
3853
3854 return [info]
3855
991ba7fa
JC
3856class YouJizzIE(InfoExtractor):
3857 """Information extractor for youjizz.com."""
ca6710ee 3858 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
991ba7fa 3859
991ba7fa 3860 def _real_extract(self, url):
ca6710ee
JC
3861 mobj = re.match(self._VALID_URL, url)
3862 if mobj is None:
3863 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
991ba7fa 3864 return
ca6710ee
JC
3865
3866 video_id = mobj.group('videoid')
3867
3868 # Get webpage content
3869 webpage = self._download_webpage(url, video_id)
991ba7fa
JC
3870
3871 # Get the video title
db16276b 3872 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
991ba7fa 3873 if result is None:
db16276b 3874 raise ExtractorError(u'Unable to extract video title')
ca6710ee 3875 video_title = result.group('title').strip()
991ba7fa
JC
3876
3877 # Get the embed page
db16276b 3878 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
991ba7fa 3879 if result is None:
db16276b 3880 raise ExtractorError(u'Unable to extract embed page')
991ba7fa 3881
ca6710ee
JC
3882 embed_page_url = result.group(0).strip()
3883 video_id = result.group('videoid')
6324fd1d 3884
ca6710ee
JC
3885 webpage = self._download_webpage(embed_page_url, video_id)
3886
991ba7fa 3887 # Get the video URL
db16276b 3888 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
991ba7fa 3889 if result is None:
db16276b 3890 raise ExtractorError(u'Unable to extract video url')
ca6710ee 3891 video_url = result.group('source')
991ba7fa
JC
3892
3893 info = {'id': video_id,
3894 'url': video_url,
991ba7fa
JC
3895 'title': video_title,
3896 'ext': 'flv',
3897 'format': 'flv',
991ba7fa
JC
3898 'player_url': embed_page_url}
3899
3900 return [info]
3901
ccf65f9d
PH
3902class EightTracksIE(InfoExtractor):
3903 IE_NAME = '8tracks'
25580f32 3904 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
ccf65f9d
PH
3905
3906 def _real_extract(self, url):
3907 mobj = re.match(self._VALID_URL, url)
3908 if mobj is None:
3909 raise ExtractorError(u'Invalid URL: %s' % url)
3910 playlist_id = mobj.group('id')
3911
3912 webpage = self._download_webpage(url, playlist_id)
3913
2a9983b7 3914 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
ccf65f9d
PH
3915 if not m:
3916 raise ExtractorError(u'Cannot find trax information')
3917 json_like = m.group(1)
3918 data = json.loads(json_like)
3919
3920 session = str(random.randint(0, 1000000000))
3921 mix_id = data['id']
3922 track_count = data['tracks_count']
3923 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3924 next_url = first_url
3925 res = []
3926 for i in itertools.count():
3927 api_json = self._download_webpage(next_url, playlist_id,
3928 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3929 errnote=u'Failed to download song information')
3930 api_data = json.loads(api_json)
3931 track_data = api_data[u'set']['track']
3932 info = {
3933 'id': track_data['id'],
3934 'url': track_data['track_file_stream_url'],
da4de959
PH
3935 'title': track_data['performer'] + u' - ' + track_data['name'],
3936 'raw_title': track_data['name'],
3937 'uploader_id': data['user']['login'],
ccf65f9d
PH
3938 'ext': 'm4a',
3939 }
3940 res.append(info)
3941 if api_data['set']['at_last_track']:
3942 break
3943 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3944 return res
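# Assumed shape of each jsonh 'play'/'next' response consumed above
# (illustrative only; the keys are exactly those read by the loop):
#   {"set": {"track": {"id": 123, "name": "Track name", "performer": "Artist",
#                      "track_file_stream_url": "http://..."},
#            "at_last_track": false}}
# at_last_track ends the loop; otherwise the track id is fed into the next_url.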
991ba7fa 3945
da06e2da
OK
3946class KeekIE(InfoExtractor):
3947 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3948 IE_NAME = u'keek'
3949
3950 def _real_extract(self, url):
3951 m = re.match(self._VALID_URL, url)
3952 video_id = m.group('videoID')
3953 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3954 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3955 webpage = self._download_webpage(url, video_id)
3956 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
f0877a44 3957 title = unescapeHTML(m.group('title'))
da06e2da 3958 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
f0877a44 3959 uploader = unescapeHTML(m.group('uploader'))
da06e2da
OK
3960 info = {
3961 'id':video_id,
3962 'url':video_url,
3963 'ext': 'mp4',
3964 'title': title,
3965 'thumbnail': thumbnail,
3966 'uploader': uploader
f0877a44 3967 }
da06e2da
OK
3968 return [info]
3969
3a468f2d 3970class TEDIE(InfoExtractor):
414638cd
JMF
3971 _VALID_URL=r'''http://www.ted.com/
3972 (
3973 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3974 |
3975 ((?P<type_talk>talks)) # We have a simple talk
3976 )
3977 /(?P<name>\w+) # Here goes the name and then ".html"
3978 '''
3979
89de9eb1
FV
3980 @classmethod
3981 def suitable(cls, url):
414638cd 3982 """Receives a URL and returns True if suitable for this IE."""
89de9eb1 3983 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
414638cd 3984
3a468f2d 3985 def _real_extract(self, url):
414638cd
JMF
3986 m=re.match(self._VALID_URL, url, re.VERBOSE)
3987 if m.group('type_talk'):
3988 return [self._talk_info(url)]
3989 else :
3990 playlist_id=m.group('playlist_id')
3991 name=m.group('name')
3992 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
3993 return self._playlist_videos_info(url,name,playlist_id)
3994
3995 def _talk_video_link(self,mediaSlug):
3996 '''Returns the video link for that mediaSlug'''
3997 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3998
3999 def _playlist_videos_info(self,url,name,playlist_id=0):
4000 '''Returns the videos of the playlist'''
4001 video_RE=r'''
4002 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4003 ([.\s]*?)data-playlist_item_id="(\d+)"
4004 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4005 '''
c85538db 4006 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
414638cd
JMF
4007 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4008 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4009 m_names=re.finditer(video_name_RE,webpage)
4010 info=[]
4011 for m_video, m_name in zip(m_videos,m_names):
c85538db
JMF
4012 video_id=m_video.group('video_id')
4013 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4014 info.append(self._talk_info(talk_url,video_id))
414638cd 4015 return info
c85538db 4016
414638cd
JMF
4017 def _talk_info(self, url, video_id=0):
4018 """Return the video for the talk in the url"""
4019 m=re.match(self._VALID_URL, url,re.VERBOSE)
4020 videoName=m.group('name')
4021 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4022 # If the url includes the language we get the title translated
c85538db 4023 title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
3a468f2d
JMF
4024 title=re.search(title_RE, webpage).group('title')
4025 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4026 "id":(?P<videoID>[\d]+).*?
4027 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
c85538db
JMF
4028 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4029 thumb_match=re.search(thumb_RE,webpage)
3a468f2d
JMF
4030 info_match=re.search(info_RE,webpage,re.VERBOSE)
4031 video_id=info_match.group('videoID')
4032 mediaSlug=info_match.group('mediaSlug')
414638cd 4033 video_url=self._talk_video_link(mediaSlug)
3a468f2d 4034 info = {
414638cd
JMF
4035 'id': video_id,
4036 'url': video_url,
3a468f2d 4037 'ext': 'mp4',
c85538db
JMF
4038 'title': title,
4039 'thumbnail': thumb_match.group('thumbnail')
414638cd
JMF
4040 }
4041 return info
da06e2da 4042
58994225 4043class MySpassIE(InfoExtractor):
1ad5d872 4044 _VALID_URL = r'http://www.myspass.de/.*'
6324fd1d 4045
1ad5d872 4046 def _real_extract(self, url):
4047 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
58994225 4048
1ad5d872 4049 # video id is the last path element of the URL
4050 # usually there is a trailing slash, so also try the second but last
4051 url_path = compat_urllib_parse_urlparse(url).path
4052 url_parent_path, video_id = os.path.split(url_path)
4053 if not video_id:
4054 _, video_id = os.path.split(url_parent_path)
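# Illustrative trace (path hypothetical): for '/myspass/shows/foo/11741/' the
# first os.path.split() returns ('/myspass/shows/foo/11741', ''), so the empty
# video_id triggers the second split, which yields '11741'.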
6324fd1d 4055
1ad5d872 4056 # get metadata
4057 metadata_url = META_DATA_URL_TEMPLATE % video_id
4058 metadata_text = self._download_webpage(metadata_url, video_id)
4059 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
6324fd1d 4060
1ad5d872 4061 # extract values from metadata
4062 url_flv_el = metadata.find('url_flv')
4063 if url_flv_el is None:
4064 self._downloader.trouble(u'ERROR: unable to extract download url')
4065 return
4066 video_url = url_flv_el.text
4067 extension = os.path.splitext(video_url)[1][1:]
4068 title_el = metadata.find('title')
4069 if title_el is None:
4070 self._downloader.trouble(u'ERROR: unable to extract title')
4071 return
4072 title = title_el.text
4073 format_id_el = metadata.find('format_id')
4074 if format_id_el is None:
4075 format = extension
4076 else:
4077 format = format_id_el.text
4078 description_el = metadata.find('description')
4079 if description_el is not None:
4080 description = description_el.text
4081 else:
4082 description = None
4083 imagePreview_el = metadata.find('imagePreview')
4084 if imagePreview_el is not None:
4085 thumbnail = imagePreview_el.text
4086 else:
4087 thumbnail = None
4088 info = {
4089 'id': video_id,
4090 'url': video_url,
4091 'title': title,
4092 'ext': extension,
4093 'format': format,
4094 'thumbnail': thumbnail,
4095 'description': description
4096 }
4097 return [info]
4098
e32b06e9
PH
4099class SpiegelIE(InfoExtractor):
4100 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?$'
4101
4102 def _real_extract(self, url):
4103 m = re.match(self._VALID_URL, url)
4104 video_id = m.group('videoID')
4105
4106 webpage = self._download_webpage(url, video_id)
4107 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4108 if not m:
4109 raise ExtractorError(u'Cannot find title')
4110 video_title = unescapeHTML(m.group(1))
4111
4112 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4113 xml_code = self._download_webpage(xml_url, video_id,
4114 note=u'Downloading XML', errnote=u'Failed to download XML')
4115
4116 idoc = xml.etree.ElementTree.fromstring(xml_code)
4117 last_type = idoc[-1]
4118 filename = last_type.findall('./filename')[0].text
4119 duration = float(last_type.findall('./duration')[0].text)
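# Assumed layout of the flash XML fetched above (illustrative only): a sequence
# of variant elements, each carrying <filename> and <duration> children;
# idoc[-1] takes the last variant listed.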
4120
4121 video_url = 'http://video2.spiegel.de/flash/' + filename
4122 video_ext = filename.rpartition('.')[2]
4123 info = {
4124 'id': video_id,
4125 'url': video_url,
4126 'ext': video_ext,
4127 'title': video_title,
4128 'duration': duration,
4129 }
4130 return [info]
4131
4132
4aeae91f
PH
4133def gen_extractors():
4134 """ Return a list of an instance of every supported extractor.
4135 The order does matter; the first extractor matched is the one handling the URL.
4136 """
4137 return [
4138 YoutubePlaylistIE(),
4139 YoutubeChannelIE(),
4140 YoutubeUserIE(),
4141 YoutubeSearchIE(),
4142 YoutubeIE(),
4143 MetacafeIE(),
4144 DailymotionIE(),
4145 GoogleSearchIE(),
4146 PhotobucketIE(),
4147 YahooIE(),
4148 YahooSearchIE(),
4149 DepositFilesIE(),
4150 FacebookIE(),
4151 BlipTVUserIE(),
4152 BlipTVIE(),
4153 VimeoIE(),
4154 MyVideoIE(),
4155 ComedyCentralIE(),
4156 EscapistIE(),
4157 CollegeHumorIE(),
4158 XVideosIE(),
4159 SoundcloudIE(),
4160 InfoQIE(),
4161 MixcloudIE(),
4162 StanfordOpenClassroomIE(),
4163 MTVIE(),
4164 YoukuIE(),
4165 XNXXIE(),
18be482a
JC
4166 YouJizzIE(),
4167 PornotubeIE(),
4168 YouPornIE(),
4aeae91f
PH
4169 GooglePlusIE(),
4170 ArteTvIE(),
4171 NBAIE(),
4172 JustinTVIE(),
4173 FunnyOrDieIE(),
4aeae91f
PH
4174 SteamIE(),
4175 UstreamIE(),
ca0a0bbe 4176 RBMARadioIE(),
ccf65f9d 4177 EightTracksIE(),
da06e2da 4178 KeekIE(),
3a468f2d 4179 TEDIE(),
58994225 4180 MySpassIE(),
e32b06e9 4181 SpiegelIE(),
4aeae91f
PH
4182 GenericIE()
4183 ]
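# Minimal usage sketch (illustrative only, not how FileDownloader is wired up
# internally): the list order matters because the first extractor whose
# suitable() accepts the URL is the one that handles it, which is why the
# specific extractors come before GenericIE.
#
#   for ie in gen_extractors():
#       if ie.suitable(url):
#           ie.set_downloader(fd)   # fd: a configured FileDownloader
#           results = ie.extract(url)
#           break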
4184
4185