1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4from __future__ import absolute_import
5
4fcca4bb 6import base64
d77c3dfd 7import datetime
ccf65f9d 8import itertools
9import netrc
10import os
11import re
12import socket
13import time
d77c3dfd 14import email.utils
921a1455 15import xml.etree.ElementTree
302efc19 16import random
17import math
d77c3dfd 18
9e8056d5 19from .utils import *
20
21
22class InfoExtractor(object):
59ae15a5 23 """Information Extractor class.
d77c3dfd 24
25 Information extractors are the classes that, given a URL, extract
26 information about the video (or videos) the URL refers to. This
27 information includes the real video URL, the video title, author and
cdb30764 28 others. The information is stored in a dictionary which is then
29 passed to the FileDownloader. The FileDownloader processes this
30 information, possibly downloading the video to the file system, among
31 other possible outcomes.
717b1f72 32
59ae15a5 33 The dictionaries must include the following fields:
717b1f72 34
35 id: Video identifier.
36 url: Final video URL.
37 title: Video title, unescaped.
38 ext: Video filename extension.
717b1f72 39
59ae15a5 40 The following fields are optional:
717b1f72 41
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnail: Full URL to a video thumbnail image.
44 description: One-line video description.
45 uploader: Full name of the video uploader.
46 upload_date: Video upload date (YYYYMMDD).
77c4beab 47 uploader_id: Nickname or id of the video uploader.
6119f78c 48 location: Physical location of the video.
49 player_url: SWF Player URL (used for rtmpdump).
50 subtitles: The .srt file contents.
51 urlhandle: [internal] The urlHandle to be used to download the file,
52 as returned by urllib.request.urlopen
d77c3dfd 53
59ae15a5 54 The fields should all be Unicode strings.
9ce5d9ee 55
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
717b1f72 59
60 _real_extract() must return a *list* of information dictionaries as
61 described above.
03c5b0fb 62
63 Finally, the _WORKING attribute should be set to False for broken IEs
64 in order to warn the users and skip the tests.
65 """
d77c3dfd 66
67 _ready = False
68 _downloader = None
69 _WORKING = True
d77c3dfd 70
71 def __init__(self, downloader=None):
72 """Constructor. Receives an optional downloader."""
73 self._ready = False
74 self.set_downloader(downloader)
d77c3dfd 75
76 def suitable(self, url):
77 """Receives a URL and returns True if suitable for this IE."""
78 return re.match(self._VALID_URL, url) is not None
d77c3dfd 79
80 def working(self):
81 """Getter method for _WORKING."""
82 return self._WORKING
03c5b0fb 83
84 def initialize(self):
85 """Initializes an instance (authentication, etc)."""
86 if not self._ready:
87 self._real_initialize()
88 self._ready = True
d77c3dfd 89
90 def extract(self, url):
91 """Extracts URL information and returns it in list of dicts."""
92 self.initialize()
93 return self._real_extract(url)
d77c3dfd 94
95 def set_downloader(self, downloader):
96 """Sets the downloader for this IE."""
97 self._downloader = downloader
d77c3dfd 98
99 def _real_initialize(self):
100 """Real initialization process. Redefine in subclasses."""
101 pass
d77c3dfd 102
103 def _real_extract(self, url):
104 """Real extraction process. Redefine in subclasses."""
105 pass
d77c3dfd 106
107 @property
108 def IE_NAME(self):
109 return type(self).__name__[:-2]
d77c3dfd 110
111 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
112 """ Returns the response handle """
113 if note is None:
114 note = u'Downloading video webpage'
115 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
116 try:
64ce2aad 117 return compat_urllib_request.urlopen(url_or_request)
118 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
119 if errnote is None:
120 errnote = u'Unable to download webpage'
01951dda 121 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
d830b7c2 122
123 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
124 """ Returns the data of the page as a string """
125 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
126 webpage_bytes = urlh.read()
127 return webpage_bytes.decode('utf-8', 'replace')
128
d830b7c2 129
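# A minimal sketch of the subclass contract described in the InfoExtractor
# docstring above: define _VALID_URL, implement _real_extract() and return a
# *list* of info dictionaries containing at least 'id', 'url', 'title' and
# 'ext'. The host name, regexes and media URL below are hypothetical and the
# class is not registered anywhere; it only illustrates the API.
class ExampleIE(InfoExtractor):
    """Illustrative information extractor (not a real site)."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?example\.com/video/(?P<id>[0-9]+)'
    IE_NAME = u'example'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('id')
        # _download_webpage() reports progress and turns network errors
        # into ExtractorError for us
        webpage = self._download_webpage(url, video_id)
        mobj = re.search(r'<title>(.*?)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        return [{
            'id': video_id,
            'url': u'http://example.com/media/%s.mp4' % video_id,  # hypothetical
            'uploader': None,
            'upload_date': None,
            'title': mobj.group(1),
            'ext': u'mp4',
        }]

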
d77c3dfd 130class YoutubeIE(InfoExtractor):
131 """Information extractor for youtube.com."""
132
133 _VALID_URL = r"""^
134 (
135 (?:https?://)? # http(s):// (optional)
136 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
137 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
138 (?:.*?\#/)? # handle anchor (#/) redirect urls
139 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
140 (?: # the various things that can precede the ID:
141 (?:(?:v|embed|e)/) # v/ or embed/ or e/
142 |(?: # or the v= param in all its forms
143 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
144 (?:\?|\#!?) # the params delimiter ? or # or #!
3bb61659 145 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
146 v=
147 )
148 )? # optional -> youtube.com/xxxx is OK
149 )? # all until now is optional -> you can pass the naked ID
150 ([0-9A-Za-z_-]+) # here it is! the YouTube video ID
151 (?(1).+)? # if we found the ID, everything can follow
152 $"""
153 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
d3f5f9f6 154 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
155 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
156 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
157 _NETRC_MACHINE = 'youtube'
158 # Listed in order of quality
159 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
160 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
161 _video_extensions = {
162 '13': '3gp',
163 '17': 'mp4',
164 '18': 'mp4',
165 '22': 'mp4',
166 '37': 'mp4',
167 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
168 '43': 'webm',
169 '44': 'webm',
170 '45': 'webm',
171 '46': 'webm',
172 }
173 _video_dimensions = {
174 '5': '240x400',
175 '6': '???',
176 '13': '???',
177 '17': '144x176',
178 '18': '360x640',
179 '22': '720x1280',
180 '34': '360x640',
181 '35': '480x854',
182 '37': '1080x1920',
183 '38': '3072x4096',
184 '43': '360x640',
185 '44': '480x854',
186 '45': '720x1280',
187 '46': '1080x1920',
cdb30764 188 }
189 IE_NAME = u'youtube'
190
191 def suitable(self, url):
192 """Receives a URL and returns True if suitable for this IE."""
193 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
194
195 def report_lang(self):
196 """Report attempt to set language."""
197 self._downloader.to_screen(u'[youtube] Setting language')
198
199 def report_login(self):
200 """Report attempt to log in."""
201 self._downloader.to_screen(u'[youtube] Logging in')
202
203 def report_age_confirmation(self):
204 """Report attempt to confirm age."""
205 self._downloader.to_screen(u'[youtube] Confirming age')
206
207 def report_video_webpage_download(self, video_id):
208 """Report attempt to download video webpage."""
209 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
210
211 def report_video_info_webpage_download(self, video_id):
212 """Report attempt to download video info webpage."""
213 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
214
215 def report_video_subtitles_download(self, video_id):
216 """Report attempt to download video subtitles."""
217 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
218
219 def report_information_extraction(self, video_id):
220 """Report attempt to extract video information."""
221 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
222
223 def report_unavailable_format(self, video_id, format):
224 """Report that the requested format is not available."""
225 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
226
227 def report_rtmp_download(self):
228 """Indicate the download will use the RTMP protocol."""
229 self._downloader.to_screen(u'[youtube] RTMP download detected')
230
231 def _closed_captions_xml_to_srt(self, xml_string):
232 srt = ''
233 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
234 # TODO parse xml instead of regex
235 for n, (start, dur_tag, dur, caption) in enumerate(texts):
236 if not dur: dur = '4'
237 start = float(start)
238 end = start + float(dur)
239 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
240 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
241 caption = unescapeHTML(caption)
242 caption = unescapeHTML(caption) # double cycle, intentional
243 srt += str(n+1) + '\n'
244 srt += start + ' --> ' + end + '\n'
245 srt += caption + '\n\n'
246 return srt
247
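    # Worked example for the conversion above (illustrative values): a caption
    # element <text start="83.5" dur="2.25"> gives start = 83.5 and
    # end = 85.75 seconds, which the "%02i:%02i:%02i,%03i" formatting turns
    # into "00:01:23,500" --> "00:01:25,750", the HH:MM:SS,mmm timestamps
    # that .srt files expect.
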
248 def _extract_subtitles(self, video_id):
249 self.report_video_subtitles_download(video_id)
250 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
251 try:
252 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
253 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
254 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
255 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
256 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
257 if not srt_lang_list:
258 return (u'WARNING: video has no closed captions', None)
259 if self._downloader.params.get('subtitleslang', False):
260 srt_lang = self._downloader.params.get('subtitleslang')
261 elif 'en' in srt_lang_list:
262 srt_lang = 'en'
263 else:
1a2c3c0f 264 srt_lang = list(srt_lang_list.keys())[0]
265 if srt_lang not in srt_lang_list:
266 return (u'WARNING: no closed captions found in the specified language', None)
267 params = compat_urllib_parse.urlencode({
268 'lang': srt_lang,
269 'name': srt_lang_list[srt_lang].encode('utf-8'),
270 'v': video_id,
271 })
272 url = 'http://www.youtube.com/api/timedtext?' + params
056d8575 273 try:
fb778e66 274 srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
276 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
277 if not srt_xml:
fb778e66 278 return (u'WARNING: Did not fetch video subtitles', None)
279 return (None, self._closed_captions_xml_to_srt(srt_xml))
280
281 def _print_formats(self, formats):
282 print('Available formats:')
283 for x in formats:
284 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
285
286 def _real_initialize(self):
287 if self._downloader is None:
288 return
289
290 username = None
291 password = None
292 downloader_params = self._downloader.params
293
294 # Attempt to use provided username and password or .netrc data
295 if downloader_params.get('username', None) is not None:
296 username = downloader_params['username']
297 password = downloader_params['password']
298 elif downloader_params.get('usenetrc', False):
299 try:
300 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
301 if info is not None:
302 username = info[0]
303 password = info[2]
304 else:
305 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
306 except (IOError, netrc.NetrcParseError) as err:
307 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
308 return
309
310 # Set language
311 request = compat_urllib_request.Request(self._LANG_URL)
312 try:
313 self.report_lang()
314 compat_urllib_request.urlopen(request).read()
315 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
316 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
317 return
318
319 # No authentication to be performed
320 if username is None:
321 return
322
323 request = compat_urllib_request.Request(self._LOGIN_URL)
324 try:
325 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
327 self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
328 return
329
330 galx = None
331 dsh = None
332 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
333 if match:
334 galx = match.group(1)
335
336 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
337 if match:
338 dsh = match.group(1)
339
59ae15a5 340 # Log in
341 login_form_strs = {
342 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
343 u'Email': username,
344 u'GALX': galx,
345 u'Passwd': password,
346 u'PersistentCookie': u'yes',
347 u'_utf8': u'霱',
348 u'bgresponse': u'js_disabled',
349 u'checkConnection': u'',
350 u'checkedDomains': u'youtube',
351 u'dnConn': u'',
352 u'dsh': dsh,
353 u'pstMsg': u'0',
354 u'rmShown': u'1',
355 u'secTok': u'',
356 u'signIn': u'Sign in',
357 u'timeStmp': u'',
358 u'service': u'youtube',
359 u'uilel': u'3',
360 u'hl': u'en_US',
361 }
362 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
363 # chokes on unicode
364 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
365 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
366 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
367 try:
368 self.report_login()
80d3177e 369 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
d3f5f9f6 370 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
371 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
372 return
373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
375 return
376
377 # Confirm age
378 age_form = {
379 'next_url': '/',
380 'action_confirm': 'Confirm',
381 }
382 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
383 try:
384 self.report_age_confirmation()
80d3177e 385 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
386 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
387 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
388 return
389
3bb61659 390 def _extract_id(self, url):
391 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
392 if mobj is None:
393 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
394 return
395 video_id = mobj.group(2)
396 return video_id
397
398 def _real_extract(self, url):
399 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
400 mobj = re.search(self._NEXT_URL_RE, url)
401 if mobj:
402 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
403 video_id = self._extract_id(url)
404
405 # Get video webpage
406 self.report_video_webpage_download(video_id)
407 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
408 request = compat_urllib_request.Request(url)
409 try:
410 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
411 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
412 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
413 return
414
415 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
416
417 # Attempt to extract SWF player URL
418 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
419 if mobj is not None:
420 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
421 else:
422 player_url = None
423
424 # Get video info
425 self.report_video_info_webpage_download(video_id)
426 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
427 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
428 % (video_id, el_type))
429 request = compat_urllib_request.Request(video_info_url)
430 try:
431 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
432 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
433 video_info = compat_parse_qs(video_info_webpage)
434 if 'token' in video_info:
435 break
436 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
437 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
438 return
439 if 'token' not in video_info:
440 if 'reason' in video_info:
441 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
442 else:
443 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
444 return
445
446 # Check for "rental" videos
447 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
448 self._downloader.trouble(u'ERROR: "rental" videos not supported')
449 return
450
451 # Start extracting information
452 self.report_information_extraction(video_id)
453
454 # uploader
455 if 'author' not in video_info:
77c4beab 456 self._downloader.trouble(u'ERROR: unable to extract uploader name')
457 return
458 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
459
460 # uploader_id
461 video_uploader_id = None
26cf0408 462 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
463 if mobj is not None:
464 video_uploader_id = mobj.group(1)
465 else:
466 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
467
468 # title
469 if 'title' not in video_info:
470 self._downloader.trouble(u'ERROR: unable to extract video title')
471 return
472 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
473
474 # thumbnail image
475 if 'thumbnail_url' not in video_info:
476 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
477 video_thumbnail = ''
478 else: # don't panic if we can't find it
479 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
480
481 # upload date
482 upload_date = None
483 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
484 if mobj is not None:
485 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
486 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
487 for expression in format_expressions:
488 try:
489 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
490 except:
491 pass
492
493 # description
494 video_description = get_element_by_id("eow-description", video_webpage)
495 if video_description:
496 video_description = clean_html(video_description)
497 else:
498 video_description = ''
499
500 # closed captions
501 video_subtitles = None
502 if self._downloader.params.get('writesubtitles', False):
503 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
504 if srt_error:
505 self._downloader.trouble(srt_error)
506
507 if 'length_seconds' not in video_info:
508 self._downloader.trouble(u'WARNING: unable to extract video duration')
509 video_duration = ''
510 else:
511 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
512
513 # token
514 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
515
516 # Decide which formats to download
517 req_format = self._downloader.params.get('format', None)
518
519 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
520 self.report_rtmp_download()
521 video_url_list = [(None, video_info['conn'][0])]
522 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
523 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
524 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
1a2c3c0f 525 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
526 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
527
528 format_limit = self._downloader.params.get('format_limit', None)
529 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
530 if format_limit is not None and format_limit in available_formats:
531 format_list = available_formats[available_formats.index(format_limit):]
532 else:
533 format_list = available_formats
534 existing_formats = [x for x in format_list if x in url_map]
535 if len(existing_formats) == 0:
536 self._downloader.trouble(u'ERROR: no known formats available for video')
537 return
538 if self._downloader.params.get('listformats', None):
539 self._print_formats(existing_formats)
540 return
541 if req_format is None or req_format == 'best':
542 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
543 elif req_format == 'worst':
544 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
545 elif req_format in ('-1', 'all'):
546 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
547 else:
548 # Specific formats. We pick the first in a slash-delimited sequence.
549 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
550 req_formats = req_format.split('/')
551 video_url_list = None
552 for rf in req_formats:
553 if rf in url_map:
554 video_url_list = [(rf, url_map[rf])]
555 break
556 if video_url_list is None:
557 self._downloader.trouble(u'ERROR: requested format not available')
558 return
559 else:
560 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
561 return
562
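        # Illustrative example of the selection above (the available itags are
        # hypothetical for a given video): with url_map containing itags
        # '22', '34' and '18', existing_formats becomes ['22', '34', '18'] in
        # quality order, so the default/best request picks '22', 'worst'
        # picks '18', and a request like '37/22/18' walks the slash-separated
        # list and picks '22', the first entry actually present in url_map.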
563 results = []
564 for format_param, video_real_url in video_url_list:
565 # Extension
566 video_extension = self._video_extensions.get(format_param, 'flv')
567
568 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
569 self._video_dimensions.get(format_param, '???'))
570
571 results.append({
572 'id': video_id,
573 'url': video_real_url,
574 'uploader': video_uploader,
77c4beab 575 'uploader_id': video_uploader_id,
576 'upload_date': upload_date,
577 'title': video_title,
578 'ext': video_extension,
579 'format': video_format,
580 'thumbnail': video_thumbnail,
581 'description': video_description,
582 'player_url': player_url,
583 'subtitles': video_subtitles,
584 'duration': video_duration
585 })
586 return results
587
588
589class MetacafeIE(InfoExtractor):
590 """Information Extractor for metacafe.com."""
591
592 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
593 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
594 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
595 IE_NAME = u'metacafe'
596
597 def __init__(self, downloader=None):
598 InfoExtractor.__init__(self, downloader)
599
600 def report_disclaimer(self):
601 """Report disclaimer retrieval."""
602 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
603
604 def report_age_confirmation(self):
605 """Report attempt to confirm age."""
606 self._downloader.to_screen(u'[metacafe] Confirming age')
607
608 def report_download_webpage(self, video_id):
609 """Report webpage download."""
610 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
611
612 def report_extraction(self, video_id):
613 """Report information extraction."""
614 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
615
616 def _real_initialize(self):
617 # Retrieve disclaimer
618 request = compat_urllib_request.Request(self._DISCLAIMER)
619 try:
620 self.report_disclaimer()
621 disclaimer = compat_urllib_request.urlopen(request).read()
622 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
623 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
624 return
625
626 # Confirm age
627 disclaimer_form = {
628 'filters': '0',
629 'submit': "Continue - I'm over 18",
630 }
631 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
632 try:
633 self.report_age_confirmation()
634 disclaimer = compat_urllib_request.urlopen(request).read()
635 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
636 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
637 return
638
639 def _real_extract(self, url):
640 # Extract id and simplified title from URL
641 mobj = re.match(self._VALID_URL, url)
642 if mobj is None:
643 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
644 return
645
646 video_id = mobj.group(1)
647
648 # Check if video comes from YouTube
649 mobj2 = re.match(r'^yt-(.*)$', video_id)
650 if mobj2 is not None:
651 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
652 return
653
654 # Retrieve video webpage to extract further information
655 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
656 try:
657 self.report_download_webpage(video_id)
658 webpage = compat_urllib_request.urlopen(request).read()
659 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
660 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
661 return
662
663 # Extract URL, uploader and title from webpage
664 self.report_extraction(video_id)
665 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
666 if mobj is not None:
667 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
668 video_extension = mediaURL[-3:]
669
670 # Extract gdaKey if available
671 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
672 if mobj is None:
673 video_url = mediaURL
674 else:
675 gdaKey = mobj.group(1)
676 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
677 else:
678 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
679 if mobj is None:
680 self._downloader.trouble(u'ERROR: unable to extract media URL')
681 return
682 vardict = compat_parse_qs(mobj.group(1))
683 if 'mediaData' not in vardict:
684 self._downloader.trouble(u'ERROR: unable to extract media URL')
685 return
686 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
687 if mobj is None:
688 self._downloader.trouble(u'ERROR: unable to extract media URL')
689 return
690 mediaURL = mobj.group(1).replace('\\/', '/')
691 video_extension = mediaURL[-3:]
692 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
693
694 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
695 if mobj is None:
696 self._downloader.trouble(u'ERROR: unable to extract title')
697 return
698 video_title = mobj.group(1).decode('utf-8')
699
700 mobj = re.search(r'submitter=(.*?);', webpage)
701 if mobj is None:
702 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
703 return
704 video_uploader = mobj.group(1)
705
706 return [{
707 'id': video_id.decode('utf-8'),
708 'url': video_url.decode('utf-8'),
709 'uploader': video_uploader.decode('utf-8'),
710 'upload_date': None,
711 'title': video_title,
712 'ext': video_extension.decode('utf-8'),
713 }]
714
715
716class DailymotionIE(InfoExtractor):
717 """Information Extractor for Dailymotion"""
718
719 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
720 IE_NAME = u'dailymotion'
b17c974a 721 _WORKING = False
722
723 def __init__(self, downloader=None):
724 InfoExtractor.__init__(self, downloader)
725
726 def report_extraction(self, video_id):
727 """Report information extraction."""
728 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
729
730 def _real_extract(self, url):
731 # Extract id and simplified title from URL
732 mobj = re.match(self._VALID_URL, url)
733 if mobj is None:
734 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
735 return
736
737 video_id = mobj.group(1).split('_')[0].split('?')[0]
738
739 video_extension = 'mp4'
740
741 # Retrieve video webpage to extract further information
742 request = compat_urllib_request.Request(url)
743 request.add_header('Cookie', 'family_filter=off')
8e241d1a 744 webpage = self._download_webpage(request, video_id)
745
746 # Extract URL, uploader and title from webpage
747 self.report_extraction(video_id)
748 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
749 if mobj is None:
750 self._downloader.trouble(u'ERROR: unable to extract media URL')
751 return
752 flashvars = compat_urllib_parse.unquote(mobj.group(1))
753
754 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
755 if key in flashvars:
756 max_quality = key
757 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
758 break
759 else:
760 self._downloader.trouble(u'ERROR: unable to extract video URL')
761 return
762
763 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
764 if mobj is None:
765 self._downloader.trouble(u'ERROR: unable to extract video URL')
766 return
767
768 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
769
770 # TODO: support choosing qualities
771
772 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
773 if mobj is None:
774 self._downloader.trouble(u'ERROR: unable to extract title')
775 return
28ca6b5a 776 video_title = unescapeHTML(mobj.group('title'))
777
778 video_uploader = None
779 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
780 if mobj is None:
781 # looking for the official user
782 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
783 if mobj_official is None:
784 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
785 else:
786 video_uploader = mobj_official.group(1)
787 else:
788 video_uploader = mobj.group(1)
789
790 video_upload_date = None
791 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
792 if mobj is not None:
793 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
794
795 return [{
796 'id': video_id,
797 'url': video_url,
798 'uploader': video_uploader,
799 'upload_date': video_upload_date,
800 'title': video_title,
28ca6b5a 801 'ext': video_extension,
59ae15a5 802 }]
803
804
d77c3dfd 805class PhotobucketIE(InfoExtractor):
806 """Information extractor for photobucket.com."""
807
808 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
809 IE_NAME = u'photobucket'
810
811 def __init__(self, downloader=None):
812 InfoExtractor.__init__(self, downloader)
813
814 def report_download_webpage(self, video_id):
815 """Report webpage download."""
816 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
817
818 def report_extraction(self, video_id):
819 """Report information extraction."""
820 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
821
822 def _real_extract(self, url):
823 # Extract id from URL
824 mobj = re.match(self._VALID_URL, url)
825 if mobj is None:
826 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
827 return
828
829 video_id = mobj.group(1)
830
831 video_extension = 'flv'
832
833 # Retrieve video webpage to extract further information
834 request = compat_urllib_request.Request(url)
835 try:
836 self.report_download_webpage(video_id)
837 webpage = compat_urllib_request.urlopen(request).read()
838 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
839 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
840 return
841
842 # Extract URL, uploader, and title from webpage
843 self.report_extraction(video_id)
844 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
845 if mobj is None:
846 self._downloader.trouble(u'ERROR: unable to extract media URL')
847 return
848 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
849
850 video_url = mediaURL
851
852 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
853 if mobj is None:
854 self._downloader.trouble(u'ERROR: unable to extract title')
855 return
856 video_title = mobj.group(1).decode('utf-8')
857
858 video_uploader = mobj.group(2).decode('utf-8')
859
860 return [{
861 'id': video_id.decode('utf-8'),
862 'url': video_url.decode('utf-8'),
863 'uploader': video_uploader,
864 'upload_date': None,
865 'title': video_title,
866 'ext': video_extension.decode('utf-8'),
867 }]
868
869
870class YahooIE(InfoExtractor):
871 """Information extractor for video.yahoo.com."""
872
93702113 873 _WORKING = False
874 # _VALID_URL matches all Yahoo! Video URLs
875 # _VPAGE_URL matches only the extractable '/watch/' URLs
876 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
877 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
878 IE_NAME = u'video.yahoo'
879
880 def __init__(self, downloader=None):
881 InfoExtractor.__init__(self, downloader)
882
883 def report_download_webpage(self, video_id):
884 """Report webpage download."""
885 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
886
887 def report_extraction(self, video_id):
888 """Report information extraction."""
889 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
890
891 def _real_extract(self, url, new_video=True):
892 # Extract ID from URL
893 mobj = re.match(self._VALID_URL, url)
894 if mobj is None:
895 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
896 return
897
898 video_id = mobj.group(2)
899 video_extension = 'flv'
900
901 # Rewrite valid but non-extractable URLs as
902 # extractable English language /watch/ URLs
903 if re.match(self._VPAGE_URL, url) is None:
904 request = compat_urllib_request.Request(url)
905 try:
906 webpage = compat_urllib_request.urlopen(request).read()
907 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
908 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
909 return
910
911 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
912 if mobj is None:
913 self._downloader.trouble(u'ERROR: Unable to extract id field')
914 return
915 yahoo_id = mobj.group(1)
916
917 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
918 if mobj is None:
919 self._downloader.trouble(u'ERROR: Unable to extract vid field')
920 return
921 yahoo_vid = mobj.group(1)
922
923 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
924 return self._real_extract(url, new_video=False)
925
926 # Retrieve video webpage to extract further information
927 request = compat_urllib_request.Request(url)
928 try:
929 self.report_download_webpage(video_id)
930 webpage = compat_urllib_request.urlopen(request).read()
931 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
932 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
933 return
934
935 # Extract uploader and title from webpage
936 self.report_extraction(video_id)
937 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
938 if mobj is None:
939 self._downloader.trouble(u'ERROR: unable to extract video title')
940 return
941 video_title = mobj.group(1).decode('utf-8')
942
943 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
944 if mobj is None:
945 self._downloader.trouble(u'ERROR: unable to extract video uploader')
946 return
947 video_uploader = mobj.group(2).decode('utf-8') # group(1) is the people/profile path segment; group(2) is the name
948
949 # Extract video thumbnail
950 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
951 if mobj is None:
952 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
953 return
954 video_thumbnail = mobj.group(1).decode('utf-8')
955
956 # Extract video description
957 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
958 if mobj is None:
959 self._downloader.trouble(u'ERROR: unable to extract video description')
960 return
961 video_description = mobj.group(1).decode('utf-8')
962 if not video_description:
963 video_description = 'No description available.'
964
965 # Extract video height and width
966 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
967 if mobj is None:
968 self._downloader.trouble(u'ERROR: unable to extract video height')
969 return
970 yv_video_height = mobj.group(1)
971
972 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
973 if mobj is None:
974 self._downloader.trouble(u'ERROR: unable to extract video width')
975 return
976 yv_video_width = mobj.group(1)
977
978 # Retrieve video playlist to extract media URL
979 # I'm not completely sure what all these options are, but we
980 # seem to need most of them, otherwise the server sends a 401.
981 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
982 yv_bitrate = '700' # according to Wikipedia this is hard-coded
983 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
984 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
985 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
986 try:
987 self.report_download_webpage(video_id)
988 webpage = compat_urllib_request.urlopen(request).read()
989 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
990 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
991 return
992
993 # Extract media URL from playlist XML
994 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
995 if mobj is None:
996 self._downloader.trouble(u'ERROR: Unable to extract media URL')
997 return
998 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
999 video_url = unescapeHTML(video_url)
1000
1001 return [{
1002 'id': video_id.decode('utf-8'),
1003 'url': video_url,
1004 'uploader': video_uploader,
1005 'upload_date': None,
1006 'title': video_title,
1007 'ext': video_extension.decode('utf-8'),
1008 'thumbnail': video_thumbnail.decode('utf-8'),
1009 'description': video_description,
1010 }]
1011
1012
1013class VimeoIE(InfoExtractor):
1014 """Information extractor for vimeo.com."""
1015
1016 # _VALID_URL matches Vimeo URLs
8edc2cf8 1017 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1018 IE_NAME = u'vimeo'
1019
1020 def __init__(self, downloader=None):
1021 InfoExtractor.__init__(self, downloader)
1022
1023 def report_download_webpage(self, video_id):
1024 """Report webpage download."""
1025 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1026
1027 def report_extraction(self, video_id):
1028 """Report information extraction."""
1029 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1030
1031 def _real_extract(self, url, new_video=True):
1032 # Extract ID from URL
1033 mobj = re.match(self._VALID_URL, url)
1034 if mobj is None:
1035 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1036 return
1037
1038 video_id = mobj.group('id')
1039 if not mobj.group('proto'):
1040 url = 'https://' + url
1041 if mobj.group('direct_link'):
1042 url = 'https://vimeo.com/' + video_id
1043
1044 # Retrieve video webpage to extract further information
1045 request = compat_urllib_request.Request(url, None, std_headers)
1046 try:
1047 self.report_download_webpage(video_id)
1048 webpage_bytes = compat_urllib_request.urlopen(request).read()
1049 webpage = webpage_bytes.decode('utf-8')
1050 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1051 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1052 return
1053
1054 # Now we begin extracting as much information as we can from what we
1055 # retrieved. First we extract the information common to all extractors,
1056 # and later we extract those that are Vimeo specific.
1057 self.report_extraction(video_id)
1058
1059 # Extract the config JSON
59ae15a5 1060 try:
1ca63e3a 1061 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1062 config = json.loads(config)
1063 except:
1064 self._downloader.trouble(u'ERROR: unable to extract info section')
1065 return
cdb30764 1066
1067 # Extract title
1068 video_title = config["video"]["title"]
1069
77c4beab 1070 # Extract uploader and uploader_id
59ae15a5 1071 video_uploader = config["video"]["owner"]["name"]
77c4beab 1072 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1073
1074 # Extract video thumbnail
1075 video_thumbnail = config["video"]["thumbnail"]
1076
1077 # Extract video description
0dcfb234 1078 video_description = get_element_by_attribute("itemprop", "description", webpage)
1079 if video_description: video_description = clean_html(video_description)
1080 else: video_description = ''
1081
1082 # Extract upload date
1083 video_upload_date = None
6b3aef80 1084 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
59ae15a5 1085 if mobj is not None:
6b3aef80 1086 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1087
1088 # Vimeo specific: extract request signature and timestamp
1089 sig = config['request']['signature']
1090 timestamp = config['request']['timestamp']
1091
1092 # Vimeo specific: extract video codec and quality information
1093 # First consider quality, then codecs, then take everything
1094 # TODO bind to format param
1095 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1096 files = { 'hd': [], 'sd': [], 'other': []}
1097 for codec_name, codec_extension in codecs:
1098 if codec_name in config["video"]["files"]:
1099 if 'hd' in config["video"]["files"][codec_name]:
1100 files['hd'].append((codec_name, codec_extension, 'hd'))
1101 elif 'sd' in config["video"]["files"][codec_name]:
1102 files['sd'].append((codec_name, codec_extension, 'sd'))
1103 else:
1104 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1105
1106 for quality in ('hd', 'sd', 'other'):
1107 if len(files[quality]) > 0:
1108 video_quality = files[quality][0][2]
1109 video_codec = files[quality][0][0]
1110 video_extension = files[quality][0][1]
1111 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1112 break
1113 else:
1114 self._downloader.trouble(u'ERROR: no known codec found')
1115 return
1116
1117 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1118 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1119
1120 return [{
1121 'id': video_id,
1122 'url': video_url,
1123 'uploader': video_uploader,
77c4beab 1124 'uploader_id': video_uploader_id,
1125 'upload_date': video_upload_date,
1126 'title': video_title,
1127 'ext': video_extension,
1128 'thumbnail': video_thumbnail,
1129 'description': video_description,
1130 }]
1131
1132
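# Note on the codec/quality selection in VimeoIE._real_extract above: files
# are bucketed by quality first ('hd', then 'sd', then 'other') and only then
# by codec preference (h264, vp8, vp6), so for example an HD VP6 file is
# chosen over an SD VP8 one, because a non-empty 'hd' bucket always wins.
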
f2ad10a9 1133class ArteTvIE(InfoExtractor):
1134 """arte.tv information extractor."""
1135
1136 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1137 _LIVE_URL = r'index-[0-9]+\.html$'
1138
1139 IE_NAME = u'arte.tv'
1140
1141 def __init__(self, downloader=None):
1142 InfoExtractor.__init__(self, downloader)
1143
1144 def report_download_webpage(self, video_id):
1145 """Report webpage download."""
1146 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1147
1148 def report_extraction(self, video_id):
1149 """Report information extraction."""
1150 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1151
1152 def fetch_webpage(self, url):
1153 request = compat_urllib_request.Request(url)
1154 try:
1155 self.report_download_webpage(url)
1156 webpage = compat_urllib_request.urlopen(request).read()
1157 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1158 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1159 return
1160 except ValueError as err:
1161 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1162 return
1163 return webpage
1164
1165 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1166 page = self.fetch_webpage(url)
1167 mobj = re.search(regex, page, regexFlags)
1168 info = {}
1169
1170 if mobj is None:
1171 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1172 return
1173
1174 for (i, key, err) in matchTuples:
1175 if mobj.group(i) is None:
1176 self._downloader.trouble(err)
1177 return
1178 else:
1179 info[key] = mobj.group(i)
1180
1181 return info
1182
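    # Illustrative call of the helper above (regex and key are hypothetical):
    #   info = self.grep_webpage(url, r'<video id="(.*?)"', 0,
    #                            [(1, 'id', u'ERROR: could not extract id')])
    # fetches the page, applies the regex with the given flags and returns
    # {'id': ...}, emitting the per-group error message when a group is
    # missing from the match.
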
1183 def extractLiveStream(self, url):
1184 video_lang = url.split('/')[-4]
1185 info = self.grep_webpage(
1186 url,
1187 r'src="(.*?/videothek_js.*?\.js)',
1188 0,
1189 [
1190 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1191 ]
1192 )
1193 http_host = url.split('/')[2]
1194 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1195 info = self.grep_webpage(
1196 next_url,
1197 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1198 '(http://.*?\.swf).*?' +
1199 '(rtmp://.*?)\'',
1200 re.DOTALL,
1201 [
1202 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1203 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1204 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1205 ]
1206 )
1207 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1208
1209 def extractPlus7Stream(self, url):
1210 video_lang = url.split('/')[-3]
1211 info = self.grep_webpage(
1212 url,
1213 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1214 0,
1215 [
1216 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1217 ]
1218 )
1219 next_url = compat_urllib_parse.unquote(info.get('url'))
1220 info = self.grep_webpage(
1221 next_url,
1222 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1223 0,
1224 [
1225 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1226 ]
1227 )
1228 next_url = compat_urllib_parse.unquote(info.get('url'))
1229
1230 info = self.grep_webpage(
1231 next_url,
1232 r'<video id="(.*?)".*?>.*?' +
1233 '<name>(.*?)</name>.*?' +
1234 '<dateVideo>(.*?)</dateVideo>.*?' +
1235 '<url quality="hd">(.*?)</url>',
1236 re.DOTALL,
1237 [
1238 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1239 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1240 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1241 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1242 ]
1243 )
1244
1245 return {
1246 'id': info.get('id'),
1247 'url': compat_urllib_parse.unquote(info.get('url')),
1248 'uploader': u'arte.tv',
1249 'upload_date': info.get('date'),
93702113 1250 'title': info.get('title').decode('utf-8'),
1251 'ext': u'mp4',
1252 'format': u'NA',
1253 'player_url': None,
1254 }
1255
1256 def _real_extract(self, url):
1257 video_id = url.split('/')[-1]
1258 self.report_extraction(video_id)
1259
1260 if re.search(self._LIVE_URL, video_id) is not None:
1261 self.extractLiveStream(url)
1262 return
1263 else:
1264 info = self.extractPlus7Stream(url)
1265
1266 return [info]
1267
1268
d77c3dfd 1269class GenericIE(InfoExtractor):
1270 """Generic last-resort information extractor."""
1271
1272 _VALID_URL = r'.*'
1273 IE_NAME = u'generic'
1274
1275 def __init__(self, downloader=None):
1276 InfoExtractor.__init__(self, downloader)
1277
1278 def report_download_webpage(self, video_id):
1279 """Report webpage download."""
1280 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1281 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1282
1283 def report_extraction(self, video_id):
1284 """Report information extraction."""
1285 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1286
1287 def report_following_redirect(self, new_url):
1288 """Report information extraction."""
1289 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
cdb30764 1290
1291 def _test_redirect(self, url):
1292 """Check if the URL is a redirect (e.g. a URL shortener) and, if so, restart the chain with the resolved URL."""
1293 class HeadRequest(compat_urllib_request.Request):
1294 def get_method(self):
1295 return "HEAD"
1296
1297 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1298 """
cdb30764 1299 Subclass the HTTPRedirectHandler to make it use our
1300 HeadRequest also on the redirected URL
1301 """
cdb30764 1302 def redirect_request(self, req, fp, code, msg, headers, newurl):
59ae15a5 1303 if code in (301, 302, 303, 307):
cdb30764 1304 newurl = newurl.replace(' ', '%20')
1305 newheaders = dict((k,v) for k,v in req.headers.items()
1306 if k.lower() not in ("content-length", "content-type"))
cdb30764 1307 return HeadRequest(newurl,
59ae15a5 1308 headers=newheaders,
1309 origin_req_host=req.get_origin_req_host(),
1310 unverifiable=True)
1311 else:
1312 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1313
1314 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1315 """
1316 Fallback to GET if HEAD is not allowed (405 HTTP error)
1317 """
cdb30764 1318 def http_error_405(self, req, fp, code, msg, headers):
1319 fp.read()
1320 fp.close()
1321
1322 newheaders = dict((k,v) for k,v in req.headers.items()
1323 if k.lower() not in ("content-length", "content-type"))
1324 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1325 headers=newheaders,
1326 origin_req_host=req.get_origin_req_host(),
1327 unverifiable=True))
1328
1329 # Build our opener
cdb30764 1330 opener = compat_urllib_request.OpenerDirector()
1331 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1332 HTTPMethodFallback, HEADRedirectHandler,
7c038b3c 1333 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1334 opener.add_handler(handler())
1335
1336 response = opener.open(HeadRequest(url))
1337 new_url = response.geturl()
1338
1339 if url == new_url:
1340 return False
1341
1342 self.report_following_redirect(new_url)
1343 self._downloader.download([new_url])
1344 return True
1345
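    # Illustrative flow for the redirect check above (URLs hypothetical): a
    # HEAD request to a shortener link such as http://bit.ly/abc123 answers
    # with a 301 to the real video page, so response.geturl() differs from
    # the original URL, the resolved URL is handed back to the downloader and
    # True is returned so a more specific extractor can take over. If the
    # server rejects HEAD with a 405, HTTPMethodFallback retries the same URL
    # with a regular GET instead.
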
1346 def _real_extract(self, url):
1347 if self._test_redirect(url): return
1348
1349 video_id = url.split('/')[-1]
1350 request = compat_urllib_request.Request(url)
1351 try:
1352 self.report_download_webpage(video_id)
1353 webpage = compat_urllib_request.urlopen(request).read()
1354 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1355 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1356 return
1357 except ValueError as err:
1358 # since this is the last-resort InfoExtractor, if
1359 # this error is thrown, it'll be thrown here
1360 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1361 return
1362
1363 self.report_extraction(video_id)
1364 # Start with something easy: JW Player in SWFObject
1365 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1366 if mobj is None:
1367 # Broaden the search a little bit
1368 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1369 if mobj is None:
1370 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371 return
1372
1373 # It's possible that one of the regexes
1374 # matched, but returned an empty group:
1375 if mobj.group(1) is None:
1376 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1377 return
1378
1379 video_url = compat_urllib_parse.unquote(mobj.group(1))
1380 video_id = os.path.basename(video_url)
1381
1382 # here's a fun little line of code for you:
1383 video_extension = os.path.splitext(video_id)[1][1:]
1384 video_id = os.path.splitext(video_id)[0]
1385
1386 # it's tempting to parse this further, but you would
1387 # have to take into account all the variations like
1388 # Video Title - Site Name
1389 # Site Name | Video Title
1390 # Video Title - Tagline | Site Name
1391 # and so on and so forth; it's just not practical
1392 mobj = re.search(r'<title>(.*)</title>', webpage)
1393 if mobj is None:
1394 self._downloader.trouble(u'ERROR: unable to extract title')
1395 return
f1171f7c 1396 video_title = mobj.group(1)
1397
1398 # video uploader is domain name
1399 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1400 if mobj is None:
1401 self._downloader.trouble(u'ERROR: unable to extract uploader')
1402 return
f1171f7c 1403 video_uploader = mobj.group(1)
1404
1405 return [{
1406 'id': video_id,
1407 'url': video_url,
1408 'uploader': video_uploader,
1409 'upload_date': None,
1410 'title': video_title,
f1171f7c 1411 'ext': video_extension,
59ae15a5 1412 }]
1413
1414
1415class YoutubeSearchIE(InfoExtractor):
1416 """Information Extractor for YouTube search queries."""
1417 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1418 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1419 _max_youtube_results = 1000
1420 IE_NAME = u'youtube:search'
1421
1422 def __init__(self, downloader=None):
1423 InfoExtractor.__init__(self, downloader)
1424
1425 def report_download_page(self, query, pagenum):
1426 """Report attempt to download search page with given number."""
1427 query = query.decode(preferredencoding())
1428 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1429
1430 def _real_extract(self, query):
1431 mobj = re.match(self._VALID_URL, query)
1432 if mobj is None:
1433 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1434 return
1435
1436 prefix, query = query.split(':')
1437 prefix = prefix[8:]
1438 query = query.encode('utf-8')
1439 if prefix == '':
1440 self._download_n_results(query, 1)
1441 return
1442 elif prefix == 'all':
1443 self._download_n_results(query, self._max_youtube_results)
1444 return
1445 else:
1446 try:
1447 n = int(prefix)
1448 if n <= 0:
1449 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1450 return
1451 elif n > self._max_youtube_results:
1452 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1453 n = self._max_youtube_results
1454 self._download_n_results(query, n)
1455 return
1456 except ValueError: # parsing prefix as integer fails
1457 self._download_n_results(query, 1)
1458 return
1459
1460 def _download_n_results(self, query, n):
1461 """Downloads a specified number of results for a query"""
1462
1463 video_ids = []
1464 pagenum = 0
1465 limit = n
1466
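# The gdata API hands back at most 50 results per request (max-results=50), so we
# walk through it with a 1-based start-index; limit starts at n and is lowered to
# the API's reported totalItems once the first response arrives.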
1467 while (50 * pagenum) < limit:
1468 self.report_download_page(query, pagenum+1)
1469 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1470 request = compat_urllib_request.Request(result_url)
1471 try:
1472 data = compat_urllib_request.urlopen(request).read()
1473 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1474 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1475 return
1476 api_response = json.loads(data)['data']
1477
1478 new_ids = list(video['id'] for video in api_response['items'])
1479 video_ids += new_ids
1480
1481 limit = min(n, api_response['totalItems'])
1482 pagenum += 1
1483
1484 if len(video_ids) > n:
1485 video_ids = video_ids[:n]
1486 for id in video_ids:
1487 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1488 return
d77c3dfd
FV
1489
1490
1491class GoogleSearchIE(InfoExtractor):
59ae15a5
PH
1492 """Information Extractor for Google Video search queries."""
1493 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1494 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1495 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1496 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1497 _max_google_results = 1000
1498 IE_NAME = u'video.google:search'
1499
1500 def __init__(self, downloader=None):
1501 InfoExtractor.__init__(self, downloader)
1502
1503 def report_download_page(self, query, pagenum):
1504 """Report attempt to download playlist page with given number."""
1505 query = query.decode(preferredencoding())
1506 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1507
1508 def _real_extract(self, query):
1509 mobj = re.match(self._VALID_URL, query)
1510 if mobj is None:
1511 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1512 return
1513
1514 prefix, query = query.split(':')
1515 prefix = prefix[8:]
1516 query = query.encode('utf-8')
1517 if prefix == '':
1518 self._download_n_results(query, 1)
1519 return
1520 elif prefix == 'all':
1521 self._download_n_results(query, self._max_google_results)
1522 return
1523 else:
1524 try:
1525 n = int(prefix)
1526 if n <= 0:
1527 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1528 return
1529 elif n > self._max_google_results:
1530 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1531 n = self._max_google_results
1532 self._download_n_results(query, n)
1533 return
1534 except ValueError: # parsing prefix as integer fails
1535 self._download_n_results(query, 1)
1536 return
1537
1538 def _download_n_results(self, query, n):
1539 """Downloads a specified number of results for a query"""
1540
1541 video_ids = []
1542 pagenum = 0
1543
1544 while True:
1545 self.report_download_page(query, pagenum)
1546 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1547 request = compat_urllib_request.Request(result_url)
1548 try:
1549 page = compat_urllib_request.urlopen(request).read()
1550 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1551 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1552 return
1553
1554 # Extract video identifiers
1555 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1556 video_id = mobj.group(1)
1557 if video_id not in video_ids:
1558 video_ids.append(video_id)
1559 if len(video_ids) == n:
1560 # Specified n videos reached
1561 for id in video_ids:
1562 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1563 return
1564
1565 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1566 for id in video_ids:
1567 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1568 return
1569
1570 pagenum = pagenum + 1
d77c3dfd
FV
1571
1572
1573class YahooSearchIE(InfoExtractor):
59ae15a5 1574 """Information Extractor for Yahoo! Video search queries."""
93702113
FV
1575
1576 _WORKING = False
59ae15a5
PH
1577 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1578 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1579 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1580 _MORE_PAGES_INDICATOR = r'\s*Next'
1581 _max_yahoo_results = 1000
1582 IE_NAME = u'video.yahoo:search'
1583
1584 def __init__(self, downloader=None):
1585 InfoExtractor.__init__(self, downloader)
1586
1587 def report_download_page(self, query, pagenum):
1588 """Report attempt to download playlist page with given number."""
1589 query = query.decode(preferredencoding())
1590 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1591
1592 def _real_extract(self, query):
1593 mobj = re.match(self._VALID_URL, query)
1594 if mobj is None:
1595 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1596 return
1597
1598 prefix, query = query.split(':')
1599 prefix = prefix[8:]
1600 query = query.encode('utf-8')
1601 if prefix == '':
1602 self._download_n_results(query, 1)
1603 return
1604 elif prefix == 'all':
1605 self._download_n_results(query, self._max_yahoo_results)
1606 return
1607 else:
1608 try:
1609 n = int(prefix)
1610 if n <= 0:
1611 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1612 return
1613 elif n > self._max_yahoo_results:
1614 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1615 n = self._max_yahoo_results
1616 self._download_n_results(query, n)
1617 return
1618 except ValueError: # parsing prefix as integer fails
1619 self._download_n_results(query, 1)
1620 return
1621
1622 def _download_n_results(self, query, n):
1623 """Downloads a specified number of results for a query"""
1624
1625 video_ids = []
1626 already_seen = set()
1627 pagenum = 1
1628
1629 while True:
1630 self.report_download_page(query, pagenum)
1631 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1632 request = compat_urllib_request.Request(result_url)
1633 try:
1634 page = compat_urllib_request.urlopen(request).read()
1635 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1636 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1637 return
1638
1639 # Extract video identifiers
1640 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1641 video_id = mobj.group(1)
1642 if video_id not in already_seen:
1643 video_ids.append(video_id)
1644 already_seen.add(video_id)
1645 if len(video_ids) == n:
1646 # Specified n videos reached
1647 for id in video_ids:
1648 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1649 return
1650
1651 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1652 for id in video_ids:
1653 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1654 return
1655
1656 pagenum = pagenum + 1
d77c3dfd
FV
1657
1658
1659class YoutubePlaylistIE(InfoExtractor):
59ae15a5
PH
1660 """Information Extractor for YouTube playlists."""
1661
e387eb5a 1662 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
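# Among the forms this matches: .../playlist?list=PL..., .../view_play_list?p=PL...,
# .../user/NAME#p/c/..., course and artist pages, and bare "PL..."/"EC..." ids; when
# the optional third group is present the URL points at a single video and is handed
# straight back to the downloader.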
59ae15a5
PH
1663 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1664 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
9789a05c 1665 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
59ae15a5
PH
1666 IE_NAME = u'youtube:playlist'
1667
1668 def __init__(self, downloader=None):
1669 InfoExtractor.__init__(self, downloader)
1670
1671 def report_download_page(self, playlist_id, pagenum):
1672 """Report attempt to download playlist page with given number."""
1673 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1674
1675 def _real_extract(self, url):
1676 # Extract playlist id
1677 mobj = re.match(self._VALID_URL, url)
1678 if mobj is None:
1679 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1680 return
1681
1682 # Single video case
1683 if mobj.group(3) is not None:
1684 self._downloader.download([mobj.group(3)])
1685 return
1686
1687 # Download playlist pages
1688 # prefix is 'p' as default for playlists but there are other types that need extra care
1689 playlist_prefix = mobj.group(1)
1690 if playlist_prefix == 'a':
1691 playlist_access = 'artist'
1692 else:
1693 playlist_prefix = 'p'
1694 playlist_access = 'view_play_list'
1695 playlist_id = mobj.group(2)
1696 video_ids = []
1697 pagenum = 1
1698
1699 while True:
1700 self.report_download_page(playlist_id, pagenum)
1701 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1702 request = compat_urllib_request.Request(url)
1703 try:
80d3177e 1704 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
1705 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1706 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1707 return
1708
1709 # Extract video identifiers
1710 ids_in_page = []
1711 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1712 if mobj.group(1) not in ids_in_page:
1713 ids_in_page.append(mobj.group(1))
1714 video_ids.extend(ids_in_page)
1715
9789a05c 1716 if self._MORE_PAGES_INDICATOR not in page:
59ae15a5
PH
1717 break
1718 pagenum = pagenum + 1
1719
9789a05c
FV
1720 total = len(video_ids)
1721
59ae15a5
PH
1722 playliststart = self._downloader.params.get('playliststart', 1) - 1
1723 playlistend = self._downloader.params.get('playlistend', -1)
1724 if playlistend == -1:
1725 video_ids = video_ids[playliststart:]
1726 else:
1727 video_ids = video_ids[playliststart:playlistend]
1728
9789a05c
FV
1729 if len(video_ids) == total:
1730 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1731 else:
1732 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1733
59ae15a5
PH
1734 for id in video_ids:
1735 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1736 return
d77c3dfd
FV
1737
1738
902b2a0a 1739class YoutubeChannelIE(InfoExtractor):
59ae15a5
PH
1740 """Information Extractor for YouTube channels."""
1741
1742 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1743 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
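# Each page of the channel's plain list view (flow=list&view=0) is fetched in turn
# and scanned for /watch?v= links until the "Next" marker stops appearing.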
9789a05c 1744 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
59ae15a5
PH
1745 IE_NAME = u'youtube:channel'
1746
1747 def report_download_page(self, channel_id, pagenum):
1748 """Report attempt to download channel page with given number."""
1749 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1750
1751 def _real_extract(self, url):
1752 # Extract channel id
1753 mobj = re.match(self._VALID_URL, url)
1754 if mobj is None:
1755 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1756 return
1757
1758 # Download channel pages
1759 channel_id = mobj.group(1)
1760 video_ids = []
1761 pagenum = 1
1762
1763 while True:
1764 self.report_download_page(channel_id, pagenum)
1765 url = self._TEMPLATE_URL % (channel_id, pagenum)
1766 request = compat_urllib_request.Request(url)
1767 try:
9789a05c 1768 page = compat_urllib_request.urlopen(request).read().decode('utf8')
59ae15a5
PH
1769 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1770 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1771 return
1772
1773 # Extract video identifiers
1774 ids_in_page = []
1775 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1776 if mobj.group(1) not in ids_in_page:
1777 ids_in_page.append(mobj.group(1))
1778 video_ids.extend(ids_in_page)
1779
9789a05c 1780 if self._MORE_PAGES_INDICATOR not in page:
59ae15a5
PH
1781 break
1782 pagenum = pagenum + 1
1783
9789a05c
FV
1784 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1785
59ae15a5
PH
1786 for id in video_ids:
1787 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1788 return
902b2a0a
FV
1789
1790
d77c3dfd 1791class YoutubeUserIE(InfoExtractor):
59ae15a5 1792 """Information Extractor for YouTube users."""
d77c3dfd 1793
59ae15a5
PH
1794 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1795 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1796 _GDATA_PAGE_SIZE = 50
1797 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
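# For user "NAME" the first request expands to
# http://gdata.youtube.com/feeds/api/users/NAME/uploads?max-results=50&start-index=1
# ("NAME" here is just a placeholder; start-index then advances in steps of 50).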
1798 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1799 IE_NAME = u'youtube:user'
d77c3dfd 1800
59ae15a5
PH
1801 def __init__(self, downloader=None):
1802 InfoExtractor.__init__(self, downloader)
d77c3dfd 1803
59ae15a5
PH
1804 def report_download_page(self, username, start_index):
1805 """Report attempt to download user page."""
1806 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1807 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
d77c3dfd 1808
59ae15a5
PH
1809 def _real_extract(self, url):
1810 # Extract username
1811 mobj = re.match(self._VALID_URL, url)
1812 if mobj is None:
1813 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1814 return
d77c3dfd 1815
59ae15a5 1816 username = mobj.group(1)
d77c3dfd 1817
59ae15a5
PH
1818 # Download video ids using YouTube Data API. Result size per
1819 # query is limited (currently to 50 videos) so we need to query
1820 # page by page until a page comes back with no video ids, which
1821 # means we have got all of them.
d77c3dfd 1822
59ae15a5
PH
1823 video_ids = []
1824 pagenum = 0
d77c3dfd 1825
59ae15a5
PH
1826 while True:
1827 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1828 self.report_download_page(username, start_index)
d77c3dfd 1829
59ae15a5 1830 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
d77c3dfd 1831
59ae15a5 1832 try:
80d3177e 1833 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
1834 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1835 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1836 return
d77c3dfd 1837
59ae15a5
PH
1838 # Extract video identifiers
1839 ids_in_page = []
d77c3dfd 1840
59ae15a5
PH
1841 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1842 if mobj.group(1) not in ids_in_page:
1843 ids_in_page.append(mobj.group(1))
d77c3dfd 1844
59ae15a5 1845 video_ids.extend(ids_in_page)
d77c3dfd 1846
59ae15a5
PH
1847 # A little optimization - if current page is not
1848 # "full", ie. does not contain PAGE_SIZE video ids then
1849 # we can assume that this page is the last one - there
1850 # are no more ids on further pages - no need to query
1851 # again.
d77c3dfd 1852
59ae15a5
PH
1853 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1854 break
d77c3dfd 1855
59ae15a5 1856 pagenum += 1
d77c3dfd 1857
59ae15a5
PH
1858 all_ids_count = len(video_ids)
1859 playliststart = self._downloader.params.get('playliststart', 1) - 1
1860 playlistend = self._downloader.params.get('playlistend', -1)
d77c3dfd 1861
59ae15a5
PH
1862 if playlistend == -1:
1863 video_ids = video_ids[playliststart:]
1864 else:
1865 video_ids = video_ids[playliststart:playlistend]
d77c3dfd 1866
59ae15a5
PH
1867 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1868 (username, all_ids_count, len(video_ids)))
d77c3dfd 1869
59ae15a5
PH
1870 for video_id in video_ids:
1871 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1872
1873
eeeb4daa 1874class BlipTVUserIE(InfoExtractor):
59ae15a5 1875 """Information Extractor for blip.tv users."""
eeeb4daa 1876
59ae15a5
PH
1877 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1878 _PAGE_SIZE = 12
1879 IE_NAME = u'blip.tv:user'
eeeb4daa 1880
59ae15a5
PH
1881 def __init__(self, downloader=None):
1882 InfoExtractor.__init__(self, downloader)
eeeb4daa 1883
59ae15a5
PH
1884 def report_download_page(self, username, pagenum):
1885 """Report attempt to download user page."""
1886 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1887 (self.IE_NAME, username, pagenum))
eeeb4daa 1888
59ae15a5
PH
1889 def _real_extract(self, url):
1890 # Extract username
1891 mobj = re.match(self._VALID_URL, url)
1892 if mobj is None:
1893 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1894 return
eeeb4daa 1895
59ae15a5 1896 username = mobj.group(1)
eeeb4daa 1897
59ae15a5 1898 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
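# The users_id placeholder is filled in from the data-users-id attribute scraped off
# the profile page below; the Ajax endpoint then lists episodes 12 at a time via a
# &page=N parameter appended in the loop further down.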
eeeb4daa 1899
59ae15a5 1900 request = compat_urllib_request.Request(url)
eeeb4daa 1901
59ae15a5
PH
1902 try:
1903 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1904 mobj = re.search(r'data-users-id="([^"]+)"', page)
1905 page_base = page_base % mobj.group(1)
1906 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1907 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1908 return
eeeb4daa
JCGS
1909
1910
59ae15a5
PH
1911 # Download video ids using BlipTV Ajax calls. Result size per
1912 # query is limited (currently to 12 videos) so we need to query
1913 # page by page until a page comes back with no video ids, which
1914 # means we have got all of them.
eeeb4daa 1915
59ae15a5
PH
1916 video_ids = []
1917 pagenum = 1
eeeb4daa 1918
59ae15a5
PH
1919 while True:
1920 self.report_download_page(username, pagenum)
eeeb4daa 1921
59ae15a5 1922 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
eeeb4daa 1923
59ae15a5
PH
1924 try:
1925 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1926 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1927 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1928 return
eeeb4daa 1929
59ae15a5
PH
1930 # Extract video identifiers
1931 ids_in_page = []
eeeb4daa 1932
59ae15a5
PH
1933 for mobj in re.finditer(r'href="/([^"]+)"', page):
1934 if mobj.group(1) not in ids_in_page:
1935 ids_in_page.append(unescapeHTML(mobj.group(1)))
eeeb4daa 1936
59ae15a5 1937 video_ids.extend(ids_in_page)
eeeb4daa 1938
59ae15a5
PH
1939 # A little optimization - if current page is not
1940 # "full", ie. does not contain PAGE_SIZE video ids then
1941 # we can assume that this page is the last one - there
1942 # are no more ids on further pages - no need to query
1943 # again.
eeeb4daa 1944
59ae15a5
PH
1945 if len(ids_in_page) < self._PAGE_SIZE:
1946 break
eeeb4daa 1947
59ae15a5 1948 pagenum += 1
eeeb4daa 1949
59ae15a5
PH
1950 all_ids_count = len(video_ids)
1951 playliststart = self._downloader.params.get('playliststart', 1) - 1
1952 playlistend = self._downloader.params.get('playlistend', -1)
eeeb4daa 1953
59ae15a5
PH
1954 if playlistend == -1:
1955 video_ids = video_ids[playliststart:]
1956 else:
1957 video_ids = video_ids[playliststart:playlistend]
eeeb4daa 1958
59ae15a5
PH
1959 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1960 (self.IE_NAME, username, all_ids_count, len(video_ids)))
eeeb4daa 1961
59ae15a5
PH
1962 for video_id in video_ids:
1963 self._downloader.download([u'http://blip.tv/'+video_id])
eeeb4daa
JCGS
1964
1965
d77c3dfd 1966class DepositFilesIE(InfoExtractor):
59ae15a5
PH
1967 """Information extractor for depositfiles.com"""
1968
1969 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
59ae15a5
PH
1970
1971 def report_download_webpage(self, file_id):
1972 """Report webpage download."""
1973 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1974
1975 def report_extraction(self, file_id):
1976 """Report information extraction."""
1977 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1978
1979 def _real_extract(self, url):
1980 file_id = url.split('/')[-1]
1981 # Rebuild url in english locale
1982 url = 'http://depositfiles.com/en/files/' + file_id
1983
1984 # Retrieve file webpage with 'Free download' button pressed
1985 free_download_indication = { 'gateway_result' : '1' }
1986 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1987 try:
1988 self.report_download_webpage(file_id)
1989 webpage = compat_urllib_request.urlopen(request).read()
1990 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1991 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1992 return
1993
1994 # Search for the real file URL
1995 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1996 if (mobj is None) or (mobj.group(1) is None):
1997 # Try to figure out reason of the error.
1998 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1999 if (mobj is not None) and (mobj.group(1) is not None):
2000 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2001 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2002 else:
2003 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2004 return
2005
2006 file_url = mobj.group(1)
2007 file_extension = os.path.splitext(file_url)[1][1:]
2008
2009 # Search for file title
2010 mobj = re.search(r'<b title="(.*?)">', webpage)
2011 if mobj is None:
2012 self._downloader.trouble(u'ERROR: unable to extract title')
2013 return
2014 file_title = mobj.group(1).decode('utf-8')
2015
2016 return [{
2017 'id': file_id.decode('utf-8'),
2018 'url': file_url.decode('utf-8'),
2019 'uploader': None,
2020 'upload_date': None,
2021 'title': file_title,
2022 'ext': file_extension.decode('utf-8'),
2023 }]
d77c3dfd
FV
2024
2025
2026class FacebookIE(InfoExtractor):
59ae15a5
PH
2027 """Information Extractor for Facebook"""
2028
59ae15a5
PH
2029 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2030 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2031 _NETRC_MACHINE = 'facebook'
59ae15a5
PH
2032 IE_NAME = u'facebook'
2033
59ae15a5
PH
2034 def report_login(self):
2035 """Report attempt to log in."""
b954070d 2036 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
59ae15a5
PH
2037
2038 def _real_initialize(self):
2039 if self._downloader is None:
2040 return
2041
2042 useremail = None
2043 password = None
2044 downloader_params = self._downloader.params
2045
2046 # Attempt to use provided username and password or .netrc data
2047 if downloader_params.get('username', None) is not None:
2048 useremail = downloader_params['username']
2049 password = downloader_params['password']
2050 elif downloader_params.get('usenetrc', False):
2051 try:
2052 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2053 if info is not None:
2054 useremail = info[0]
2055 password = info[2]
2056 else:
2057 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2058 except (IOError, netrc.NetrcParseError) as err:
2059 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2060 return
2061
2062 if useremail is None:
2063 return
2064
2065 # Log in
2066 login_form = {
2067 'email': useremail,
2068 'pass': password,
2069 'login': 'Log+In'
2070 }
2071 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2072 try:
2073 self.report_login()
2074 login_results = compat_urllib_request.urlopen(request).read()
2075 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2076 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2077 return
2078 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2079 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2080 return
2081
2082 def _real_extract(self, url):
2083 mobj = re.match(self._VALID_URL, url)
2084 if mobj is None:
2085 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2086 return
2087 video_id = mobj.group('ID')
2088
b954070d
PH
2089 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2090 webpage = self._download_webpage(url, video_id)
2091
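# The player's flashvars sit in the page as a JSON array wedged between the two
# JavaScript snippets below; capture that array, then unquote its 'params' entry to
# reach hd_src/sd_src, the duration and the thumbnail.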
2092 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2093 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2094 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2095 if not m:
2096 raise ExtractorError(u'Cannot parse data')
2097 data = dict(json.loads(m.group(1)))
edba5137
PH
2098 params_raw = compat_urllib_parse.unquote(data['params'])
2099 params = json.loads(params_raw)
2100 video_url = params['hd_src']
7796e8c2
PH
2101 if not video_url:
2102 video_url = params['sd_src']
2103 if not video_url:
2104 raise ExtractorError(u'Cannot find video URL')
edba5137 2105 video_duration = int(params['video_duration'])
b954070d
PH
2106
2107 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2108 if not m:
2109 raise ExtractorError(u'Cannot find title in webpage')
2110 video_title = unescapeHTML(m.group(1))
2111
2112 info = {
2113 'id': video_id,
2114 'title': video_title,
2115 'url': video_url,
2116 'ext': 'mp4',
2117 'duration': video_duration,
edba5137 2118 'thumbnail': params['thumbnail_src'],
b954070d
PH
2119 }
2120 return [info]
59ae15a5 2121
d77c3dfd
FV
2122
2123class BlipTVIE(InfoExtractor):
59ae15a5
PH
2124 """Information extractor for blip.tv"""
2125
2126 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2127 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2128 IE_NAME = u'blip.tv'
2129
2130 def report_extraction(self, file_id):
2131 """Report information extraction."""
2132 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2133
2134 def report_direct_download(self, title):
2135 """Report information extraction."""
2136 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2137
2138 def _real_extract(self, url):
2139 mobj = re.match(self._VALID_URL, url)
2140 if mobj is None:
2141 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2142 return
2143
2144 if '?' in url:
2145 cchar = '&'
2146 else:
2147 cchar = '?'
2148 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
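# Appending skin=json&version=2&no_wrap=1 asks blip.tv for the post metadata as JSON;
# if the URL instead answers with a video/* Content-Type it is treated as a direct
# file download in the branch below.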
55c05398 2149 request = compat_urllib_request.Request(json_url)
3446dfb7 2150 request.add_header('User-Agent', 'iTunes/10.6.1')
59ae15a5
PH
2151 self.report_extraction(mobj.group(1))
2152 info = None
2153 try:
2154 urlh = compat_urllib_request.urlopen(request)
2155 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2156 basename = url.split('/')[-1]
2157 title,ext = os.path.splitext(basename)
2158 title = title.decode('UTF-8')
2159 ext = ext.replace('.', '')
2160 self.report_direct_download(title)
2161 info = {
2162 'id': title,
2163 'url': url,
2164 'uploader': None,
2165 'upload_date': None,
2166 'title': title,
2167 'ext': ext,
2168 'urlhandle': urlh
2169 }
2170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3446dfb7 2171 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
59ae15a5
PH
2172 if info is None: # Regular URL
2173 try:
55c05398
PH
2174 json_code_bytes = urlh.read()
2175 json_code = json_code_bytes.decode('utf-8')
59ae15a5
PH
2176 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2177 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2178 return
2179
2180 try:
2181 json_data = json.loads(json_code)
2182 if 'Post' in json_data:
2183 data = json_data['Post']
2184 else:
2185 data = json_data
2186
2187 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2188 video_url = data['media']['url']
2189 umobj = re.match(self._URL_EXT, video_url)
2190 if umobj is None:
2191 raise ValueError('Can not determine filename extension')
2192 ext = umobj.group(1)
2193
2194 info = {
2195 'id': data['item_id'],
2196 'url': video_url,
2197 'uploader': data['display_name'],
2198 'upload_date': upload_date,
2199 'title': data['title'],
2200 'ext': ext,
2201 'format': data['media']['mimeType'],
2202 'thumbnail': data['thumbnailUrl'],
2203 'description': data['description'],
3446dfb7
PH
2204 'player_url': data['embedUrl'],
2205 'user_agent': 'iTunes/10.6.1',
59ae15a5
PH
2206 }
2207 except (ValueError,KeyError) as err:
2208 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2209 return
2210
59ae15a5 2211 return [info]
d77c3dfd
FV
2212
2213
2214class MyVideoIE(InfoExtractor):
59ae15a5
PH
2215 """Information Extractor for myvideo.de."""
2216
2217 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2218 IE_NAME = u'myvideo'
2219
2220 def __init__(self, downloader=None):
2221 InfoExtractor.__init__(self, downloader)
cdb30764 2222
59ae15a5
PH
2223 def report_extraction(self, video_id):
2224 """Report information extraction."""
2225 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2226
2227 def _real_extract(self,url):
2228 mobj = re.match(self._VALID_URL, url)
2229 if mobj is None:
2230 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2231 return
2232
2233 video_id = mobj.group(1)
2234
2235 # Get video webpage
5f955171
PH
2236 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2237 webpage = self._download_webpage(webpage_url, video_id)
59ae15a5
PH
2238
2239 self.report_extraction(video_id)
6d436336 2240 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
59ae15a5
PH
2241 webpage)
2242 if mobj is None:
2243 self._downloader.trouble(u'ERROR: unable to extract media URL')
2244 return
2245 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2246
2247 mobj = re.search('<title>([^<]+)</title>', webpage)
2248 if mobj is None:
2249 self._downloader.trouble(u'ERROR: unable to extract title')
2250 return
2251
2252 video_title = mobj.group(1)
2253
2254 return [{
2255 'id': video_id,
2256 'url': video_url,
2257 'uploader': None,
2258 'upload_date': None,
2259 'title': video_title,
2260 'ext': u'flv',
2261 }]
d77c3dfd
FV
2262
2263class ComedyCentralIE(InfoExtractor):
59ae15a5
PH
2264 """Information extractor for The Daily Show and Colbert Report """
2265
ca6849e6 2266 # urls can be abbreviations like :thedailyshow or :colbert
cdb30764 2267 # urls for episodes like:
ca6849e6 2268 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2269 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
cdb30764 2270 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
ca6849e6 2271 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2272 |(https?://)?(www\.)?
2273 (?P<showname>thedailyshow|colbertnation)\.com/
2274 (full-episodes/(?P<episode>.*)|
2275 (?P<clip>
2276 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2277 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
cdb30764 2278 $"""
59ae15a5
PH
2279
2280 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2281
2282 _video_extensions = {
2283 '3500': 'mp4',
2284 '2200': 'mp4',
2285 '1700': 'mp4',
2286 '1200': 'mp4',
2287 '750': 'mp4',
2288 '400': 'mp4',
2289 }
2290 _video_dimensions = {
2291 '3500': '1280x720',
2292 '2200': '960x540',
2293 '1700': '768x432',
2294 '1200': '640x360',
2295 '750': '512x288',
2296 '400': '384x216',
2297 }
2298
ca6849e6 2299 def suitable(self, url):
2300 """Receives a URL and returns True if suitable for this IE."""
2301 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2302
59ae15a5
PH
2303 def report_extraction(self, episode_id):
2304 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2305
32635ec6
PH
2306 def report_config_download(self, episode_id, media_id):
2307 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
59ae15a5
PH
2308
2309 def report_index_download(self, episode_id):
2310 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2311
59ae15a5
PH
2312 def _print_formats(self, formats):
2313 print('Available formats:')
2314 for x in formats:
2315 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2316
2317
2318 def _real_extract(self, url):
ca6849e6 2319 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2320 if mobj is None:
2321 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2322 return
2323
2324 if mobj.group('shortname'):
2325 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2326 url = u'http://www.thedailyshow.com/full-episodes/'
2327 else:
2328 url = u'http://www.colbertnation.com/full-episodes/'
ca6849e6 2329 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2330 assert mobj is not None
2331
ca6849e6 2332 if mobj.group('clip'):
2333 if mobj.group('showname') == 'thedailyshow':
2334 epTitle = mobj.group('tdstitle')
2335 else:
2336 epTitle = mobj.group('cntitle')
2337 dlNewest = False
59ae15a5 2338 else:
ca6849e6 2339 dlNewest = not mobj.group('episode')
2340 if dlNewest:
2341 epTitle = mobj.group('showname')
2342 else:
2343 epTitle = mobj.group('episode')
59ae15a5
PH
2344
2345 req = compat_urllib_request.Request(url)
2346 self.report_extraction(epTitle)
2347 try:
2348 htmlHandle = compat_urllib_request.urlopen(req)
2349 html = htmlHandle.read()
93148102 2350 webpage = html.decode('utf-8')
59ae15a5
PH
2351 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2352 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2353 return
2354 if dlNewest:
2355 url = htmlHandle.geturl()
ca6849e6 2356 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2357 if mobj is None:
2358 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2359 return
2360 if mobj.group('episode') == '':
2361 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2362 return
2363 epTitle = mobj.group('episode')
2364
93148102 2365 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
59ae15a5
PH
2366
2367 if len(mMovieParams) == 0:
2368 # The Colbert Report embeds the information in a data-mgid attribute
2369 # without a URL prefix; so extract the alternate reference
2370 # and then add the URL prefix manually.
2371
93148102 2372 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
59ae15a5
PH
2373 if len(altMovieParams) == 0:
2374 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2375 return
2376 else:
2377 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
cdb30764 2378
59ae15a5
PH
2379 uri = mMovieParams[0][1]
2380 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2381 self.report_index_download(epTitle)
2382 try:
2383 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2384 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2385 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2386 return
2387
2388 results = []
2389
2390 idoc = xml.etree.ElementTree.fromstring(indexXml)
2391 itemEls = idoc.findall('.//item')
7717ae19 2392 for partNum,itemEl in enumerate(itemEls):
59ae15a5
PH
2393 mediaId = itemEl.findall('./guid')[0].text
2394 shortMediaId = mediaId.split(':')[-1]
2395 showId = mediaId.split(':')[-2].replace('.com', '')
2396 officialTitle = itemEl.findall('./title')[0].text
2397 officialDate = itemEl.findall('./pubDate')[0].text
2398
2399 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2400 compat_urllib_parse.urlencode({'uri': mediaId}))
2401 configReq = compat_urllib_request.Request(configUrl)
32635ec6 2402 self.report_config_download(epTitle, shortMediaId)
59ae15a5
PH
2403 try:
2404 configXml = compat_urllib_request.urlopen(configReq).read()
2405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2406 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2407 return
2408
2409 cdoc = xml.etree.ElementTree.fromstring(configXml)
2410 turls = []
2411 for rendition in cdoc.findall('.//rendition'):
2412 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2413 turls.append(finfo)
2414
2415 if len(turls) == 0:
2416 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2417 continue
cdb30764 2418
59ae15a5
PH
2419 if self._downloader.params.get('listformats', None):
2420 self._print_formats([i[0] for i in turls])
2421 return
2422
2423 # For now, just pick the highest bitrate
32635ec6 2424 format,rtmp_video_url = turls[-1]
59ae15a5
PH
2425
2426 # Get the format arg from the arg stream
2427 req_format = self._downloader.params.get('format', None)
2428
2429 # Select format if we can find one
2430 for f,v in turls:
2431 if f == req_format:
32635ec6 2432 format, rtmp_video_url = f, v
59ae15a5
PH
2433 break
2434
32635ec6
PH
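# The config only exposes an RTMP URL; lift its gsp.comedystor/... path onto the
# plain-HTTP base below so the clip can be fetched without rtmpdump.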
2435 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2436 if not m:
2437 raise ExtractorError(u'Cannot transform RTMP url')
2438 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2439 video_url = base + m.group('finalid')
59ae15a5 2440
7717ae19 2441 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
59ae15a5
PH
2442 info = {
2443 'id': shortMediaId,
2444 'url': video_url,
2445 'uploader': showId,
2446 'upload_date': officialDate,
2447 'title': effTitle,
2448 'ext': 'mp4',
2449 'format': format,
2450 'thumbnail': None,
2451 'description': officialTitle,
59ae15a5 2452 }
59ae15a5 2453 results.append(info)
cdb30764 2454
59ae15a5 2455 return results
d77c3dfd
FV
2456
2457
2458class EscapistIE(InfoExtractor):
59ae15a5
PH
2459 """Information extractor for The Escapist """
2460
2461 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2462 IE_NAME = u'escapist'
2463
2464 def report_extraction(self, showName):
2465 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2466
2467 def report_config_download(self, showName):
2468 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2469
2470 def _real_extract(self, url):
2471 mobj = re.match(self._VALID_URL, url)
2472 if mobj is None:
2473 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2474 return
2475 showName = mobj.group('showname')
2476 videoId = mobj.group('episode')
2477
2478 self.report_extraction(showName)
2479 try:
2480 webPage = compat_urllib_request.urlopen(url)
2481 webPageBytes = webPage.read()
2482 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2483 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2484 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2485 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2486 return
2487
2488 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2489 description = unescapeHTML(descMatch.group(1))
2490 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2491 imgUrl = unescapeHTML(imgMatch.group(1))
2492 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2493 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2494 configUrlMatch = re.search('config=(.*)$', playerUrl)
2495 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2496
2497 self.report_config_download(showName)
2498 try:
93702113
FV
2499 configJSON = compat_urllib_request.urlopen(configUrl)
2500 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2501 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
59ae15a5
PH
2502 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2503 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2504 return
2505
2506 # Technically, it's JavaScript, not JSON
2507 configJSON = configJSON.replace("'", '"')
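# Note: this naive quote swap assumes the config contains no apostrophes inside
# string values; a title with a ' in it would yield invalid JSON and end up in the
# error branch below.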
2508
2509 try:
2510 config = json.loads(configJSON)
2511 except (ValueError,) as err:
2512 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2513 return
2514
2515 playlist = config['playlist']
2516 videoUrl = playlist[1]['url']
2517
2518 info = {
2519 'id': videoId,
2520 'url': videoUrl,
2521 'uploader': showName,
2522 'upload_date': None,
2523 'title': showName,
2524 'ext': 'flv',
2525 'thumbnail': imgUrl,
2526 'description': description,
2527 'player_url': playerUrl,
2528 }
2529
2530 return [info]
d77c3dfd 2531
d77c3dfd 2532class CollegeHumorIE(InfoExtractor):
59ae15a5
PH
2533 """Information extractor for collegehumor.com"""
2534
0eb0faa2 2535 _WORKING = False
59ae15a5
PH
2536 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2537 IE_NAME = u'collegehumor'
2538
799c0763 2539 def report_manifest(self, video_id):
59ae15a5 2540 """Report information extraction."""
799c0763 2541 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
59ae15a5
PH
2542
2543 def report_extraction(self, video_id):
2544 """Report information extraction."""
2545 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2546
2547 def _real_extract(self, url):
2548 mobj = re.match(self._VALID_URL, url)
2549 if mobj is None:
2550 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2551 return
2552 video_id = mobj.group('videoid')
2553
59ae15a5
PH
2554 info = {
2555 'id': video_id,
59ae15a5
PH
2556 'uploader': None,
2557 'upload_date': None,
2558 }
2559
2560 self.report_extraction(video_id)
799c0763 2561 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
59ae15a5
PH
2562 try:
2563 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2564 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2565 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2566 return
2567
2568 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2569 try:
2570 videoNode = mdoc.findall('./video')[0]
2571 info['description'] = videoNode.findall('./description')[0].text
2572 info['title'] = videoNode.findall('./caption')[0].text
59ae15a5 2573 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
799c0763 2574 manifest_url = videoNode.findall('./file')[0].text
59ae15a5
PH
2575 except IndexError:
2576 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2577 return
2578
799c0763
PH
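# The moogaloop metadata points at an Adobe HDS (f4m) manifest; fetch it, read the
# first <media> node and splice its url attribute into a Seg1-Frag1 fragment URL
# below (the hdcore query parameter presumably mimics what the Flash player sends).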
2579 manifest_url += '?hdcore=2.10.3'
2580 self.report_manifest(video_id)
2581 try:
2582 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2583 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2584 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2585 return
2586
2587 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2588 try:
2589 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2590 node_id = media_node.attrib['url']
2591 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2592 except IndexError as err:
2593 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2594 return
2595
2596 url_pr = compat_urllib_parse_urlparse(manifest_url)
2597 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2598
2599 info['url'] = url
2600 info['ext'] = 'f4f'
59ae15a5 2601 return [info]
d77c3dfd
FV
2602
2603
2604class XVideosIE(InfoExtractor):
59ae15a5 2605 """Information extractor for xvideos.com"""
d77c3dfd 2606
59ae15a5
PH
2607 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2608 IE_NAME = u'xvideos'
d77c3dfd 2609
59ae15a5
PH
2610 def report_extraction(self, video_id):
2611 """Report information extraction."""
2612 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
d77c3dfd 2613
59ae15a5
PH
2614 def _real_extract(self, url):
2615 mobj = re.match(self._VALID_URL, url)
2616 if mobj is None:
2617 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2618 return
8588a86f 2619 video_id = mobj.group(1)
d77c3dfd 2620
5f955171 2621 webpage = self._download_webpage(url, video_id)
d77c3dfd 2622
59ae15a5 2623 self.report_extraction(video_id)
d77c3dfd
FV
2624
2625
59ae15a5
PH
2626 # Extract video URL
2627 mobj = re.search(r'flv_url=(.+?)&', webpage)
2628 if mobj is None:
2629 self._downloader.trouble(u'ERROR: unable to extract video url')
2630 return
8588a86f 2631 video_url = compat_urllib_parse.unquote(mobj.group(1))
d77c3dfd
FV
2632
2633
59ae15a5
PH
2634 # Extract title
2635 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2636 if mobj is None:
2637 self._downloader.trouble(u'ERROR: unable to extract video title')
2638 return
8588a86f 2639 video_title = mobj.group(1)
d77c3dfd
FV
2640
2641
59ae15a5
PH
2642 # Extract video thumbnail
2643 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2644 if mobj is None:
2645 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2646 return
8588a86f 2647 video_thumbnail = mobj.group(0)
d77c3dfd 2648
59ae15a5
PH
2649 info = {
2650 'id': video_id,
2651 'url': video_url,
2652 'uploader': None,
2653 'upload_date': None,
2654 'title': video_title,
2655 'ext': 'flv',
2656 'thumbnail': video_thumbnail,
2657 'description': None,
2658 }
d77c3dfd 2659
59ae15a5 2660 return [info]
d77c3dfd
FV
2661
2662
2663class SoundcloudIE(InfoExtractor):
59ae15a5
PH
2664 """Information extractor for soundcloud.com
2665 To access the media, the uid of the song and a stream token
2666 must be extracted from the page source and the script must make
2667 a request to media.soundcloud.com/crossdomain.xml. Then
2668 the media can be grabbed by requesting from a URL composed
2669 of the stream token and uid.
2670 """
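# In practice the code below goes through SoundCloud's API endpoints instead: resolve
# the page URL via api.soundcloud.com/resolve.json (with a client_id), read the track
# id and metadata from the returned JSON, then fetch .../i1/tracks/<id>/streams and
# use its http_mp3_128_url entry as the download URL.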
2671
2672 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2673 IE_NAME = u'soundcloud'
2674
2675 def __init__(self, downloader=None):
2676 InfoExtractor.__init__(self, downloader)
2677
8fd3afd5 2678 def report_resolve(self, video_id):
59ae15a5 2679 """Report information extraction."""
8fd3afd5 2680 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
59ae15a5
PH
2681
2682 def report_extraction(self, video_id):
2683 """Report information extraction."""
8fd3afd5 2684 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
59ae15a5
PH
2685
2686 def _real_extract(self, url):
2687 mobj = re.match(self._VALID_URL, url)
2688 if mobj is None:
2689 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2690 return
2691
2692 # extract uploader (which is in the url)
15c8d833 2693 uploader = mobj.group(1)
59ae15a5 2694 # extract simple title (uploader + slug of song title)
15c8d833 2695 slug_title = mobj.group(2)
59ae15a5
PH
2696 simple_title = uploader + u'-' + slug_title
2697
8fd3afd5 2698 self.report_resolve('%s/%s' % (uploader, slug_title))
59ae15a5 2699
8fd3afd5
PH
2700 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2701 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2702 request = compat_urllib_request.Request(resolv_url)
59ae15a5 2703 try:
8fd3afd5
PH
2704 info_json_bytes = compat_urllib_request.urlopen(request).read()
2705 info_json = info_json_bytes.decode('utf-8')
59ae15a5
PH
2706 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2707 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2708 return
2709
8fd3afd5
PH
2710 info = json.loads(info_json)
2711 video_id = info['id']
59ae15a5
PH
2712 self.report_extraction('%s/%s' % (uploader, slug_title))
2713
8fd3afd5 2714 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
c7214f9a 2715 request = compat_urllib_request.Request(streams_url)
8fd3afd5
PH
2716 try:
2717 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2718 stream_json = stream_json_bytes.decode('utf-8')
2719 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5f955171 2720 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
b4cd069d 2721 return
59ae15a5 2722
8fd3afd5 2723 streams = json.loads(stream_json)
c7214f9a 2724 mediaURL = streams['http_mp3_128_url']
59ae15a5
PH
2725
2726 return [{
c7214f9a 2727 'id': info['id'],
59ae15a5 2728 'url': mediaURL,
c7214f9a
PH
2729 'uploader': info['user']['username'],
2730 'upload_date': info['created_at'],
2731 'title': info['title'],
59ae15a5 2732 'ext': u'mp3',
c7214f9a 2733 'description': info['description'],
59ae15a5 2734 }]
d77c3dfd
FV
2735
2736
2737class InfoQIE(InfoExtractor):
59ae15a5 2738 """Information extractor for infoq.com"""
59ae15a5 2739 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
59ae15a5 2740
59ae15a5
PH
2741 def report_extraction(self, video_id):
2742 """Report information extraction."""
2743 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2744
2745 def _real_extract(self, url):
2746 mobj = re.match(self._VALID_URL, url)
2747 if mobj is None:
2748 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2749 return
2750
4fcca4bb 2751 webpage = self._download_webpage(url, video_id=url)
59ae15a5
PH
2752 self.report_extraction(url)
2753
59ae15a5
PH
2754 # Extract video URL
2755 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2756 if mobj is None:
2757 self._downloader.trouble(u'ERROR: unable to extract video url')
2758 return
4fcca4bb
PH
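# The jsclassref value is base64-encoded (and percent-escaped underneath); decoding
# and unquoting it yields the stream name that gets appended to the rtmpe://
# application URL on the next line.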
2759 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2760 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
59ae15a5
PH
2761
2762 # Extract title
2763 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2764 if mobj is None:
2765 self._downloader.trouble(u'ERROR: unable to extract video title')
2766 return
4fcca4bb 2767 video_title = mobj.group(1)
59ae15a5
PH
2768
2769 # Extract description
2770 video_description = u'No description available.'
2771 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2772 if mobj is not None:
4fcca4bb 2773 video_description = mobj.group(1)
59ae15a5
PH
2774
2775 video_filename = video_url.split('/')[-1]
2776 video_id, extension = video_filename.split('.')
2777
2778 info = {
2779 'id': video_id,
2780 'url': video_url,
2781 'uploader': None,
2782 'upload_date': None,
2783 'title': video_title,
2784 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2785 'thumbnail': None,
2786 'description': video_description,
2787 }
2788
2789 return [info]
d77c3dfd
FV
2790
2791class MixcloudIE(InfoExtractor):
59ae15a5 2792 """Information extractor for www.mixcloud.com"""
93702113
FV
2793
2794 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
59ae15a5
PH
2795 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2796 IE_NAME = u'mixcloud'
2797
2798 def __init__(self, downloader=None):
2799 InfoExtractor.__init__(self, downloader)
2800
2801 def report_download_json(self, file_id):
2802 """Report JSON download."""
2803 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2804
2805 def report_extraction(self, file_id):
2806 """Report information extraction."""
2807 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2808
2809 def get_urls(self, jsonData, fmt, bitrate='best'):
2810 """Get urls from 'audio_formats' section in json"""
2811 file_url = None
2812 try:
2813 bitrate_list = jsonData[fmt]
2814 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2815 bitrate = max(bitrate_list) # select highest
2816
2817 url_list = jsonData[fmt][bitrate]
2818 except TypeError: # we have no bitrate info.
2819 url_list = jsonData[fmt]
2820 return url_list
2821
2822 def check_urls(self, url_list):
2823 """Returns 1st active url from list"""
2824 for url in url_list:
2825 try:
2826 compat_urllib_request.urlopen(url)
2827 return url
2828 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2829 url = None
2830
2831 return None
2832
2833 def _print_formats(self, formats):
2834 print('Available formats:')
2835 for fmt in formats.keys():
2836 for b in formats[fmt]:
2837 try:
2838 ext = formats[fmt][b][0]
2839 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2840 except TypeError: # we have no bitrate info
2841 ext = formats[fmt][0]
2842 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2843 break
2844
2845 def _real_extract(self, url):
2846 mobj = re.match(self._VALID_URL, url)
2847 if mobj is None:
2848 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2849 return
2850 # extract uploader & filename from url
2851 uploader = mobj.group(1).decode('utf-8')
2852 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2853
2854 # construct API request
2855 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2856 # retrieve .json file with links to files
2857 request = compat_urllib_request.Request(file_url)
2858 try:
2859 self.report_download_json(file_url)
2860 jsonData = compat_urllib_request.urlopen(request).read()
2861 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2862 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2863 return
2864
2865 # parse JSON
2866 json_data = json.loads(jsonData)
2867 player_url = json_data['player_swf_url']
2868 formats = dict(json_data['audio_formats'])
2869
2870 req_format = self._downloader.params.get('format', None)
2871 bitrate = None
2872
2873 if self._downloader.params.get('listformats', None):
2874 self._print_formats(formats)
2875 return
2876
2877 if req_format is None or req_format == 'best':
2878 for format_param in formats.keys():
2879 url_list = self.get_urls(formats, format_param)
2880 # check urls
2881 file_url = self.check_urls(url_list)
2882 if file_url is not None:
2883 break # got it!
2884 else:
99b0a129 2885 if req_format not in formats:
59ae15a5
PH
2886 self._downloader.trouble(u'ERROR: format is not available')
2887 return
2888
2889 url_list = self.get_urls(formats, req_format)
2890 file_url = self.check_urls(url_list)
2891 format_param = req_format
2892
2893 return [{
2894 'id': file_id,
2895 'url': file_url,
2896 'uploader': uploader,
2897 'upload_date': None,
2898 'title': json_data['name'],
2899 'ext': file_url.split('.')[-1],
2900 'format': (format_param is None and u'NA' or format_param),
2901 'thumbnail': json_data['thumbnail_url'],
2902 'description': json_data['description'],
2903 'player_url': player_url,
2904 }]
d77c3dfd
FV
2905
2906class StanfordOpenClassroomIE(InfoExtractor):
59ae15a5
PH
2907 """Information extractor for Stanford's Open ClassRoom"""
2908
2909 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2910 IE_NAME = u'stanfordoc'
2911
2912 def report_download_webpage(self, objid):
2913 """Report information extraction."""
2914 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2915
2916 def report_extraction(self, video_id):
2917 """Report information extraction."""
2918 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2919
2920 def _real_extract(self, url):
2921 mobj = re.match(self._VALID_URL, url)
2922 if mobj is None:
f0bad2b0 2923 raise ExtractorError(u'Invalid URL: %s' % url)
59ae15a5
PH
2924
2925 if mobj.group('course') and mobj.group('video'): # A specific video
2926 course = mobj.group('course')
2927 video = mobj.group('video')
2928 info = {
2929 'id': course + '_' + video,
2930 'uploader': None,
2931 'upload_date': None,
2932 }
2933
2934 self.report_extraction(info['id'])
2935 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2936 xmlUrl = baseUrl + video + '.xml'
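            # e.g. (hypothetical course/video names) course 'MachineLearning',
            # video 'lecture1' ->
            # http://openclassroom.stanford.edu/MainFolder/courses/MachineLearning/videos/lecture1.xml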
2937 try:
2938 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2939 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2940 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2941 return
2942 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2943 try:
2944 info['title'] = mdoc.findall('./title')[0].text
2945 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2946 except IndexError:
2947 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2948 return
2949 info['ext'] = info['url'].rpartition('.')[2]
2950 return [info]
2951 elif mobj.group('course'): # A course page
2952 course = mobj.group('course')
2953 info = {
2954 'id': course,
2955 'type': 'playlist',
2956 'uploader': None,
2957 'upload_date': None,
2958 }
2959
f0bad2b0
PH
2960 coursepage = self._download_webpage(url, info['id'],
2961 note='Downloading course info page',
2962 errnote='Unable to download course info page')
59ae15a5
PH
2963
2964 m = re.search('<h1>([^<]+)</h1>', coursepage)
2965 if m:
2966 info['title'] = unescapeHTML(m.group(1))
2967 else:
2968 info['title'] = info['id']
2969
2970 m = re.search('<description>([^<]+)</description>', coursepage)
2971 if m:
2972 info['description'] = unescapeHTML(m.group(1))
2973
2974 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2975 info['list'] = [
2976 {
2977 'type': 'reference',
2978 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2979 }
2980 for vpage in links]
2981 results = []
2982 for entry in info['list']:
2983 assert entry['type'] == 'reference'
2984 results += self.extract(entry['url'])
2985 return results
59ae15a5
PH
2986 else: # Root page
2987 info = {
2988 'id': 'Stanford OpenClassroom',
2989 'type': 'playlist',
2990 'uploader': None,
2991 'upload_date': None,
2992 }
2993
2994 self.report_download_webpage(info['id'])
2995 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2996 try:
2997 rootpage = compat_urllib_request.urlopen(rootURL).read()
2998 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2999 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3000 return
3001
3002 info['title'] = info['id']
3003
3004 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3005 info['list'] = [
3006 {
3007 'type': 'reference',
3008 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3009 }
3010 for cpage in links]
3011
3012 results = []
3013 for entry in info['list']:
3014 assert entry['type'] == 'reference'
3015 results += self.extract(entry['url'])
3016 return results
d77c3dfd
FV
3017
3018class MTVIE(InfoExtractor):
59ae15a5
PH
3019 """Information extractor for MTV.com"""
3020
3021 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3022 IE_NAME = u'mtv'
3023
59ae15a5
PH
3024 def report_extraction(self, video_id):
3025 """Report information extraction."""
3026 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3027
3028 def _real_extract(self, url):
3029 mobj = re.match(self._VALID_URL, url)
3030 if mobj is None:
3031 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3032 return
3033 if not mobj.group('proto'):
3034 url = 'http://' + url
3035 video_id = mobj.group('videoid')
59ae15a5 3036
5f955171 3037 webpage = self._download_webpage(url, video_id)
59ae15a5
PH
3038
3039 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3040 if mobj is None:
3041 self._downloader.trouble(u'ERROR: unable to extract song name')
3042 return
3043 song_name = unescapeHTML(mobj.group(1))
3044 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3045 if mobj is None:
3046 self._downloader.trouble(u'ERROR: unable to extract performer')
3047 return
3048 performer = unescapeHTML(mobj.group(1))
cdb30764 3049 video_title = performer + ' - ' + song_name
59ae15a5
PH
3050
3051 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3052 if mobj is None:
3053 self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
3054 return
3055 mtvn_uri = mobj.group(1)
3056
3057 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3058 if mobj is None:
3059 self._downloader.trouble(u'ERROR: unable to extract content id')
3060 return
3061 content_id = mobj.group(1)
3062
3063 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3064 self.report_extraction(video_id)
3065 request = compat_urllib_request.Request(videogen_url)
3066 try:
3067 metadataXml = compat_urllib_request.urlopen(request).read()
3068 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3069 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3070 return
3071
3072 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3073 renditions = mdoc.findall('.//rendition')
3074
3075 # For now, always pick the highest quality.
3076 rendition = renditions[-1]
3077
3078 try:
3079 _,_,ext = rendition.attrib['type'].partition('/')
3080 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3081 video_url = rendition.find('./src').text
3082 except KeyError:
3083 self._downloader.trouble('Invalid rendition field.')
3084 return
3085
3086 info = {
3087 'id': video_id,
3088 'url': video_url,
3089 'uploader': performer,
3090 'upload_date': None,
3091 'title': video_title,
3092 'ext': ext,
3093 'format': format,
3094 }
3095
3096 return [info]
6de7ef9b 3097
302efc19 3098
302efc19 3099class YoukuIE(InfoExtractor):
59ae15a5 3100 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
59ae15a5
PH
3101
3102 def report_download_webpage(self, file_id):
3103 """Report webpage download."""
a34dd63b 3104 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
59ae15a5
PH
3105
3106 def report_extraction(self, file_id):
3107 """Report information extraction."""
a34dd63b 3108 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
59ae15a5
PH
3109
3110 def _gen_sid(self):
3111 nowTime = int(time.time() * 1000)
3112 random1 = random.randint(1000,1998)
3113 random2 = random.randint(1000,9999)
3114
3115 return "%d%d%d" %(nowTime,random1,random2)
3116
3117 def _get_file_ID_mix_string(self, seed):
3118 mixed = []
3119 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3120 seed = float(seed)
3121 for i in range(len(source)):
3122 seed = (seed * 211 + 30031 ) % 65536
3123 index = math.floor(seed / 65536 * len(source) )
3124 mixed.append(source[int(index)])
3125 source.remove(source[int(index)])
3126 #return ''.join(mixed)
3127 return mixed
3128
3129 def _get_file_id(self, fileId, seed):
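        # De-obfuscation sketch (hypothetical values, for illustration only):
        #   mixed = self._get_file_ID_mix_string(seed)  # deterministic shuffle
        #   fileId '3*0*12' -> mixed[3] + mixed[0] + mixed[12]
        # i.e. each '*'-separated number is an index into the seed-shuffled
        # alphabet and the real id is the concatenation of those characters.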
3130 mixed = self._get_file_ID_mix_string(seed)
3131 ids = fileId.split('*')
3132 realId = []
3133 for ch in ids:
3134 if ch:
3135 realId.append(mixed[int(ch)])
3136 return ''.join(realId)
3137
3138 def _real_extract(self, url):
3139 mobj = re.match(self._VALID_URL, url)
3140 if mobj is None:
3141 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3142 return
3143 video_id = mobj.group('ID')
3144
3145 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3146
3147 request = compat_urllib_request.Request(info_url, None, std_headers)
3148 try:
3149 self.report_download_webpage(video_id)
3150 jsondata = compat_urllib_request.urlopen(request).read()
3151 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3152 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3153 return
3154
3155 self.report_extraction(video_id)
3156 try:
8f6f40d9
PH
3157 jsonstr = jsondata.decode('utf-8')
3158 config = json.loads(jsonstr)
59ae15a5
PH
3159
3160 video_title = config['data'][0]['title']
3161 seed = config['data'][0]['seed']
3162
3163 format = self._downloader.params.get('format', None)
1a2c3c0f 3164 supported_format = list(config['data'][0]['streamfileids'].keys())
59ae15a5
PH
3165
3166 if format is None or format == 'best':
3167 if 'hd2' in supported_format:
3168 format = 'hd2'
3169 else:
3170 format = 'flv'
3171 ext = u'flv'
3172 elif format == 'worst':
3173 format = 'mp4'
3174 ext = u'mp4'
3175 else:
3176 format = 'flv'
3177 ext = u'flv'
3178
3179
3180 fileid = config['data'][0]['streamfileids'][format]
e2a8ff24 3181 keys = [s['k'] for s in config['data'][0]['segs'][format]]
8f6f40d9 3182 except (UnicodeDecodeError, ValueError, KeyError):
59ae15a5
PH
3183 self._downloader.trouble(u'ERROR: unable to extract info section')
3184 return
3185
3186 files_info=[]
3187 sid = self._gen_sid()
3188 fileid = self._get_file_id(fileid, seed)
3189
3190 # characters 8 and 9 (0-based) of fileid encode the segment number,
3191 # so fileid[8:10] is replaced with the two-digit hex index below
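        # e.g. (hypothetical) fileid 'ABCDEFGH??REST...' with index 1 becomes
        #   'ABCDEFGH' + '01' + 'REST...'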
3192 for index, key in enumerate(keys):
3193
3194 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3195 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3196
3197 info = {
3198 'id': '%s_part%02d' % (video_id, index),
3199 'url': download_url,
3200 'uploader': None,
3201 'upload_date': None,
3202 'title': video_title,
3203 'ext': ext,
3204 }
3205 files_info.append(info)
3206
3207 return files_info
5dc846fa
FV
3208
3209
6de7ef9b 3210class XNXXIE(InfoExtractor):
59ae15a5
PH
3211 """Information extractor for xnxx.com"""
3212
caec7618 3213 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
59ae15a5
PH
3214 IE_NAME = u'xnxx'
3215 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3216 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3217 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3218
3219 def report_webpage(self, video_id):
3220 """Report information extraction"""
3221 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3222
3223 def report_extraction(self, video_id):
3224 """Report information extraction"""
3225 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3226
3227 def _real_extract(self, url):
3228 mobj = re.match(self._VALID_URL, url)
3229 if mobj is None:
3230 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3231 return
bec102a8 3232 video_id = mobj.group(1)
59ae15a5
PH
3233
3234 self.report_webpage(video_id)
3235
3236 # Get webpage content
3237 try:
bec102a8
PH
3238 webpage_bytes = compat_urllib_request.urlopen(url).read()
3239 webpage = webpage_bytes.decode('utf-8')
59ae15a5
PH
3240 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3241 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3242 return
3243
3244 result = re.search(self.VIDEO_URL_RE, webpage)
3245 if result is None:
3246 self._downloader.trouble(u'ERROR: unable to extract video url')
3247 return
bec102a8 3248 video_url = compat_urllib_parse.unquote(result.group(1))
59ae15a5
PH
3249
3250 result = re.search(self.VIDEO_TITLE_RE, webpage)
3251 if result is None:
3252 self._downloader.trouble(u'ERROR: unable to extract video title')
3253 return
bec102a8 3254 video_title = result.group(1)
59ae15a5
PH
3255
3256 result = re.search(self.VIDEO_THUMB_RE, webpage)
3257 if result is None:
3258 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3259 return
bec102a8 3260 video_thumbnail = result.group(1)
59ae15a5
PH
3261
3262 return [{
3263 'id': video_id,
3264 'url': video_url,
3265 'uploader': None,
3266 'upload_date': None,
3267 'title': video_title,
3268 'ext': 'flv',
3269 'thumbnail': video_thumbnail,
3270 'description': None,
3271 }]
fd873c69
FV
3272
3273
d443aca8 3274class GooglePlusIE(InfoExtractor):
59ae15a5
PH
3275 """Information extractor for plus.google.com."""
3276
93702113 3277 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
59ae15a5
PH
3278 IE_NAME = u'plus.google'
3279
3280 def __init__(self, downloader=None):
3281 InfoExtractor.__init__(self, downloader)
3282
3283 def report_extract_entry(self, url):
3284 """Report downloading extry"""
93702113 3285 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
59ae15a5
PH
3286
3287 def report_date(self, upload_date):
3288 """Report downloading extry"""
3289 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3290
3291 def report_uploader(self, uploader):
3292 """Report downloading extry"""
93702113 3293 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
59ae15a5
PH
3294
3295 def report_title(self, video_title):
3296 """Report downloading extry"""
93702113 3297 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
59ae15a5
PH
3298
3299 def report_extract_vid_page(self, video_page):
3300 """Report information extraction."""
93702113 3301 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
59ae15a5
PH
3302
3303 def _real_extract(self, url):
3304 # Extract id from URL
3305 mobj = re.match(self._VALID_URL, url)
3306 if mobj is None:
3307 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3308 return
3309
3310 post_url = mobj.group(0)
93702113 3311 video_id = mobj.group(1)
59ae15a5
PH
3312
3313 video_extension = 'flv'
3314
3315 # Step 1, Retrieve post webpage to extract further information
3316 self.report_extract_entry(post_url)
3317 request = compat_urllib_request.Request(post_url)
3318 try:
93702113 3319 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
3320 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3321 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3322 return
3323
3324 # Extract update date
3325 upload_date = None
3326 pattern = 'title="Timestamp">(.*?)</a>'
3327 mobj = re.search(pattern, webpage)
3328 if mobj:
3329 upload_date = mobj.group(1)
3330 # Convert timestring to a format suitable for filename
3331 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3332 upload_date = upload_date.strftime('%Y%m%d')
3333 self.report_date(upload_date)
3334
3335 # Extract uploader
3336 uploader = None
3337 pattern = r'rel\="author".*?>(.*?)</a>'
3338 mobj = re.search(pattern, webpage)
3339 if mobj:
3340 uploader = mobj.group(1)
3341 self.report_uploader(uploader)
3342
3343 # Extract title
3344 # Get the first line for title
3345 video_title = u'NA'
3346 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3347 mobj = re.search(pattern, webpage)
3348 if mobj:
3349 video_title = mobj.group(1)
3350 self.report_title(video_title)
3351
3352 # Step 2, Simulate clicking the image box to launch video
3353 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3354 mobj = re.search(pattern, webpage)
3355 if mobj is None:
3356 self._downloader.trouble(u'ERROR: unable to extract video page URL')
return
3357
3358 video_page = mobj.group(1)
3359 request = compat_urllib_request.Request(video_page)
3360 try:
93702113 3361 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
3362 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3363 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3364 return
3365 self.report_extract_vid_page(video_page)
3366
3367
3368 # Extract video links of all sizes from the video page
3370 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3371 mobj = re.findall(pattern, webpage)
3372 if len(mobj) == 0:
3373 self._downloader.trouble(u'ERROR: unable to extract video links')
return
3374
3375 # Sort by resolution (ascending)
3376 links = sorted(mobj)
3377
3378 # Choose the last entry of the sort, i.e. the highest resolution
3379 video_url = links[-1]
3380 # Only get the url. The resolution part in the tuple has no use anymore
3381 video_url = video_url[-1]
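        # e.g. (hypothetical matches) [('400', 'http://redirector.googlevideo.com/a'),
        #                              ('720', 'http://redirector.googlevideo.com/b')]
        # -> sorted()[-1] is the ('720', ...) tuple and its last element is the URL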
3382 # Treat escaped \u0026 style hex
93702113
FV
3383 try:
3384 video_url = video_url.decode("unicode_escape")
3385 except AttributeError: # Python 3
3386 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
59ae15a5
PH
3387
3388
3389 return [{
93702113 3390 'id': video_id,
59ae15a5 3391 'url': video_url,
93702113
FV
3392 'uploader': uploader,
3393 'upload_date': upload_date,
3394 'title': video_title,
3395 'ext': video_extension,
59ae15a5 3396 }]
4cc3d074
PH
3397
3398class NBAIE(InfoExtractor):
3399 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3400 IE_NAME = u'nba'
3401
4cc3d074
PH
3402 def _real_extract(self, url):
3403 mobj = re.match(self._VALID_URL, url)
3404 if mobj is None:
3405 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3406 return
3407
3408 video_id = mobj.group(1)
3409 if video_id.endswith('/index.html'):
3410 video_id = video_id[:-len('/index.html')]
3411
5f955171 3412 webpage = self._download_webpage(url, video_id)
4cc3d074
PH
3413
3414 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3415 def _findProp(rexp, default=None):
3416 m = re.search(rexp, webpage)
3417 if m:
3418 return unescapeHTML(m.group(1))
3419 else:
3420 return default
3421
3422 shortened_video_id = video_id.rpartition('/')[2]
3423 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3424 info = {
3425 'id': shortened_video_id,
3426 'url': video_url,
3427 'ext': 'mp4',
3428 'title': title,
3429 'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3430 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3431 }
3432 return [info]
0b40544f
DV
3433
3434class JustinTVIE(InfoExtractor):
3435 """Information extractor for justin.tv and twitch.tv"""
2ab1c5ed
DV
3436 # TODO: One broadcast may be split into multiple videos. The key
3437 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3438 # starts at 1 and increases. Can we treat all parts as one video?
3439
4096b609
DV
3440 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3441 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3442 _JUSTIN_PAGE_LIMIT = 100
0b40544f
DV
3443 IE_NAME = u'justin.tv'
3444
3445 def report_extraction(self, file_id):
3446 """Report information extraction."""
3447 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3448
4096b609
DV
3449 def report_download_page(self, channel, offset):
3450 """Report attempt to download a single page of videos."""
3451 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3452 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3453
2ab1c5ed
DV
3454 # Return count of items, list of *valid* items
3455 def _parse_page(self, url):
0b40544f 3456 try:
2ab1c5ed 3457 urlh = compat_urllib_request.urlopen(url)
0b40544f
DV
3458 webpage_bytes = urlh.read()
3459 webpage = webpage_bytes.decode('utf-8', 'ignore')
3460 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3461 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3462 return
cdb30764 3463
0b40544f 3464 response = json.loads(webpage)
fa1bf9c6 3465 if type(response) != list:
3466 error_text = response.get('error', 'unknown error')
3467 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3468 return
0b40544f
DV
3469 info = []
3470 for clip in response:
3471 video_url = clip['video_file_url']
3472 if video_url:
3473 video_extension = os.path.splitext(video_url)[1][1:]
fa1bf9c6 3474 video_date = re.sub('-', '', clip['start_time'][:10])
3475 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
97f194c1
PH
3476 video_id = clip['id']
3477 video_title = clip.get('title', video_id)
0b40544f 3478 info.append({
97f194c1 3479 'id': video_id,
0b40544f 3480 'url': video_url,
97f194c1 3481 'title': video_title,
fa1bf9c6 3482 'uploader': clip.get('channel_name', video_uploader_id),
3483 'uploader_id': video_uploader_id,
0b40544f
DV
3484 'upload_date': video_date,
3485 'ext': video_extension,
3486 })
2ab1c5ed
DV
3487 return (len(response), info)
3488
3489 def _real_extract(self, url):
3490 mobj = re.match(self._VALID_URL, url)
3491 if mobj is None:
3492 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3493 return
cdb30764 3494
2ab1c5ed
DV
3495 api = 'http://api.justin.tv'
3496 video_id = mobj.group(mobj.lastindex)
3497 paged = False
3498 if mobj.lastindex == 1:
3499 paged = True
3500 api += '/channel/archives/%s.json'
3501 else:
fa1bf9c6 3502 api += '/broadcast/by_archive/%s.json'
2ab1c5ed 3503 api = api % (video_id,)
cdb30764 3504
2ab1c5ed 3505 self.report_extraction(video_id)
cdb30764 3506
2ab1c5ed
DV
3507 info = []
3508 offset = 0
4096b609
DV
3509 limit = self._JUSTIN_PAGE_LIMIT
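        # Paging sketch (hypothetical channel name): channel URLs are fetched in
        # slices of _JUSTIN_PAGE_LIMIT items, e.g.
        #   http://api.justin.tv/channel/archives/somechannel.json?offset=0&limit=100
        #   http://api.justin.tv/channel/archives/somechannel.json?offset=100&limit=100
        # until a page returns fewer than `limit` entries.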
3510 while True:
3511 if paged:
3512 self.report_download_page(video_id, offset)
2ab1c5ed
DV
3513 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3514 page_count, page_info = self._parse_page(page_url)
3515 info.extend(page_info)
3516 if not paged or page_count != limit:
3517 break
3518 offset += limit
0b40544f 3519 return info
21a9c6aa
PH
3520
3521class FunnyOrDieIE(InfoExtractor):
3522 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
21a9c6aa 3523
21a9c6aa
PH
3524 def _real_extract(self, url):
3525 mobj = re.match(self._VALID_URL, url)
3526 if mobj is None:
3527 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3528 return
3529
3530 video_id = mobj.group('id')
5f955171 3531 webpage = self._download_webpage(url, video_id)
21a9c6aa
PH
3532
3533 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3534 if not m:
3535 self._downloader.trouble(u'ERROR: unable to find video information')
3536 video_url = unescapeHTML(m.group('url'))
21a9c6aa
PH
3537
3538 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3539 if not m:
3540 self._downloader.trouble(u'Cannot find video title')
3541 title = unescapeHTML(m.group('title'))
3542
3543 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3544 if m:
3545 desc = unescapeHTML(m.group('desc'))
3546 else:
3547 desc = None
3548
3549 info = {
3550 'id': video_id,
3551 'url': video_url,
3552 'ext': 'mp4',
3553 'title': title,
3554 'description': desc,
3555 }
3556 return [info]
d0d4f277
PH
3557
3558class TweetReelIE(InfoExtractor):
3559 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3560
d0d4f277
PH
3561 def _real_extract(self, url):
3562 mobj = re.match(self._VALID_URL, url)
3563 if mobj is None:
3564 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3565 return
3566
3567 video_id = mobj.group('id')
5f955171 3568 webpage = self._download_webpage(url, video_id)
d0d4f277
PH
3569
3570 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3571 if not m:
3572 self._downloader.trouble(u'ERROR: Cannot find status ID')
3573 status_id = m.group(1)
3574
3575 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3576 if not m:
3577 self._downloader.trouble(u'WARNING: Cannot find description')
3578 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3579
3580 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3581 if not m:
3582 self._downloader.trouble(u'ERROR: Cannot find uploader')
3583 uploader = unescapeHTML(m.group('uploader'))
3584 uploader_id = unescapeHTML(m.group('uploader_id'))
3585
3586 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3587 if not m:
3588 self._downloader.trouble(u'ERROR: Cannot find upload date')
3589 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3590
3591 title = desc
3592 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3593
3594 info = {
3595 'id': video_id,
3596 'url': video_url,
3597 'ext': 'mov',
3598 'title': title,
3599 'description': desc,
3600 'uploader': uploader,
3601 'uploader_id': uploader_id,
3602 'internal_id': status_id,
3603 'upload_date': upload_date
3604 }
3605 return [info]
e314ba67
JMF
3606
3607class SteamIE(InfoExtractor):
3608 _VALID_URL = r"""http://store.steampowered.com/
3609 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3610 (?P<gameID>\d+)/?
3611 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3612 """
4aeae91f 3613
e314ba67
JMF
3614 def suitable(self, url):
3615 """Receives a URL and returns True if suitable for this IE."""
3616 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
5f955171 3617
e314ba67
JMF
3618 def _real_extract(self, url):
3619 m = re.match(self._VALID_URL, url, re.VERBOSE)
3620 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3621 gameID = m.group('gameID')
3622 videourl = 'http://store.steampowered.com/video/%s/' % gameID
5f955171 3623 webpage = self._download_webpage(videourl, gameID)
e314ba67 3624 mweb = re.finditer(urlRE, webpage)
5e9d042d
JMF
3625 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3626 titles = re.finditer(namesRE, webpage)
e314ba67 3627 videos = []
5f955171 3628 for vid,vtitle in zip(mweb,titles):
e314ba67 3629 video_id = vid.group('videoID')
5f955171
PH
3630 title = vtitle.group('videoName')
3631 video_url = vid.group('videoURL')
e314ba67
JMF
3632 if not video_url:
3633 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
e314ba67
JMF
3634 info = {
3635 'id':video_id,
3636 'url':video_url,
3637 'ext': 'flv',
5e9d042d 3638 'title': unescapeHTML(title)
e314ba67
JMF
3639 }
3640 videos.append(info)
3641 return videos
ef0c8d5f 3642
278986ea 3643class UstreamIE(InfoExtractor):
ef0c8d5f 3644 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
278986ea 3645 IE_NAME = u'ustream'
ef0c8d5f 3646
278986ea
JMF
3647 def _real_extract(self, url):
3648 m = re.match(self._VALID_URL, url)
3649 video_id = m.group('videoID')
3650 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
d830b7c2 3651 webpage = self._download_webpage(url, video_id)
278986ea
JMF
3652 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3653 title = m.group('title')
3654 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3655 uploader = m.group('uploader')
3656 info = {
3657 'id':video_id,
3658 'url':video_url,
3659 'ext': 'flv',
3660 'title': title,
3661 'uploader': uploader
3662 }
3663 return [info]
4aeae91f 3664
ca0a0bbe
PH
3665class RBMARadioIE(InfoExtractor):
3666 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3667
3668 def _real_extract(self, url):
3669 m = re.match(self._VALID_URL, url)
3670 video_id = m.group('videoID')
3671
3672 webpage = self._download_webpage(url, video_id)
3673 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3674 if not m:
3675 raise ExtractorError(u'Cannot find metadata')
3676 json_data = m.group(1)
3677
3678 try:
3679 data = json.loads(json_data)
3680 except ValueError as e:
3681 raise ExtractorError(u'Invalid JSON: ' + str(e))
3682
3683 video_url = data['akamai_url'] + '&cbr=256'
3684 url_parts = compat_urllib_parse_urlparse(video_url)
3685 video_ext = url_parts.path.rpartition('.')[2]
3686 info = {
3687 'id': video_id,
3688 'url': video_url,
3689 'ext': video_ext,
3690 'title': data['title'],
3691 'description': data.get('teaser_text'),
3692 'location': data.get('country_of_origin'),
3693 'uploader': data.get('host', {}).get('name'),
3694 'uploader_id': data.get('host', {}).get('slug'),
187f491a 3695 'thumbnail': data.get('image', {}).get('large_url_2x'),
ca0a0bbe
PH
3696 'duration': data.get('duration'),
3697 }
3698 return [info]
4aeae91f 3699
991ba7fa
JC
3700
3701class YouPornIE(InfoExtractor):
3702 """Information extractor for youporn.com."""
991ba7fa 3703 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
ca6710ee 3704
991ba7fa
JC
3705 def _print_formats(self, formats):
3706 """Print all available formats"""
565f7519 3707 print(u'Available formats:')
ca6710ee
JC
3708 print(u'ext\t\tformat')
3709 print(u'---------------------------------')
991ba7fa 3710 for format in formats:
ca6710ee 3711 print(u'%s\t\t%s' % (format['ext'], format['format']))
991ba7fa
JC
3712
3713 def _specific(self, req_format, formats):
3714 for x in formats:
3715 if(x["format"]==req_format):
3716 return x
3717 return None
3718
991ba7fa
JC
3719 def _real_extract(self, url):
3720 mobj = re.match(self._VALID_URL, url)
3721 if mobj is None:
3722 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3723 return
3724
ca6710ee 3725 video_id = mobj.group('videoid')
991ba7fa 3726
629fcdd1
PH
3727 req = compat_urllib_request.Request(url)
3728 req.add_header('Cookie', 'age_verified=1')
3729 webpage = self._download_webpage(req, video_id)
991ba7fa
JC
3730
3731 # Get the video title
e711babb 3732 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
991ba7fa 3733 if result is None:
e711babb 3734 raise ExtractorError(u'Unable to extract video title')
ca6710ee 3735 video_title = result.group('title').strip()
991ba7fa
JC
3736
3737 # Get the video date
e711babb 3738 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
991ba7fa 3739 if result is None:
629fcdd1
PH
3740 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3741 upload_date = None
3742 else:
3743 upload_date = result.group('date').strip()
991ba7fa
JC
3744
3745 # Get the video uploader
e711babb 3746 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
991ba7fa 3747 if result is None:
e711babb 3748 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
629fcdd1
PH
3749 video_uploader = None
3750 else:
3751 video_uploader = result.group('uploader').strip()
3752 video_uploader = clean_html( video_uploader )
991ba7fa
JC
3753
3754 # Get all of the formats available
ca6710ee
JC
3755 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3756 result = re.search(DOWNLOAD_LIST_RE, webpage)
991ba7fa 3757 if result is None:
629fcdd1 3758 raise ExtractorError(u'Unable to extract download list')
ca6710ee 3759 download_list_html = result.group('download_list').strip()
991ba7fa
JC
3760
3761 # Get all of the links from the page
ca6710ee
JC
3762 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3763 links = re.findall(LINK_RE, download_list_html)
991ba7fa 3764 if(len(links) == 0):
629fcdd1 3765 raise ExtractorError(u'ERROR: no known formats available for video')
991ba7fa
JC
3766
3767 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3768
3769 formats = []
3770 for link in links:
3771
3772 # A link looks like this:
3773 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3774 # A path looks like this:
3775 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
ca6710ee
JC
3776 video_url = unescapeHTML( link )
3777 path = compat_urllib_parse_urlparse( video_url ).path
991ba7fa
JC
3778 extension = os.path.splitext( path )[1][1:]
3779 format = path.split('/')[4].split('_')[:2]
3780 size = format[0]
3781 bitrate = format[1]
3782 format = "-".join( format )
3783 title = u'%s-%s-%s' % (video_title, size, bitrate)
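            # with the sample path above: extension == 'mp4', size == '480p',
            # bitrate == '370k', format == '480p-370k'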
3784
3785 formats.append({
3786 'id': video_id,
3787 'url': video_url,
3788 'uploader': video_uploader,
3789 'upload_date': upload_date,
3790 'title': title,
3791 'ext': extension,
3792 'format': format,
3793 'thumbnail': None,
3794 'description': None,
3795 'player_url': None
3796 })
3797
3798 if self._downloader.params.get('listformats', None):
3799 self._print_formats(formats)
3800 return
3801
3802 req_format = self._downloader.params.get('format', None)
991ba7fa
JC
3803 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3804
991ba7fa
JC
3805 if req_format is None or req_format == 'best':
3806 return [formats[0]]
3807 elif req_format == 'worst':
3808 return [formats[-1]]
3809 elif req_format in ('-1', 'all'):
3810 return formats
3811 else:
3812 format = self._specific( req_format, formats )
3813 if format is None:
3814 self._downloader.trouble(u'ERROR: requested format not available')
3815 return
3816 return [format]
3817
3818
3819
3820class PornotubeIE(InfoExtractor):
3821 """Information extractor for pornotube.com."""
991ba7fa 3822 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
991ba7fa 3823
991ba7fa
JC
3824 def _real_extract(self, url):
3825 mobj = re.match(self._VALID_URL, url)
3826 if mobj is None:
3827 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3828 return
3829
ca6710ee
JC
3830 video_id = mobj.group('videoid')
3831 video_title = mobj.group('title')
991ba7fa
JC
3832
3833 # Get webpage content
ca6710ee 3834 webpage = self._download_webpage(url, video_id)
991ba7fa
JC
3835
3836 # Get the video URL
ca6710ee
JC
3837 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3838 result = re.search(VIDEO_URL_RE, webpage)
991ba7fa
JC
3839 if result is None:
3840 self._downloader.trouble(u'ERROR: unable to extract video url')
3841 return
ca6710ee 3842 video_url = compat_urllib_parse.unquote(result.group('url'))
991ba7fa
JC
3843
3844 #Get the uploaded date
ca6710ee
JC
3845 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3846 result = re.search(VIDEO_UPLOADED_RE, webpage)
991ba7fa
JC
3847 if result is None:
3848 self._downloader.trouble(u'ERROR: unable to extract video title')
3849 return
ca6710ee 3850 upload_date = result.group('date')
991ba7fa
JC
3851
3852 info = {'id': video_id,
3853 'url': video_url,
3854 'uploader': None,
3855 'upload_date': upload_date,
3856 'title': video_title,
3857 'ext': 'flv',
565f7519 3858 'format': 'flv'}
991ba7fa
JC
3859
3860 return [info]
3861
991ba7fa
JC
3862class YouJizzIE(InfoExtractor):
3863 """Information extractor for youjizz.com."""
ca6710ee 3864 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
991ba7fa 3865
991ba7fa 3866 def _real_extract(self, url):
ca6710ee
JC
3867 mobj = re.match(self._VALID_URL, url)
3868 if mobj is None:
3869 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
991ba7fa 3870 return
ca6710ee
JC
3871
3872 video_id = mobj.group('videoid')
3873
3874 # Get webpage content
3875 webpage = self._download_webpage(url, video_id)
991ba7fa
JC
3876
3877 # Get the video title
db16276b 3878 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
991ba7fa 3879 if result is None:
db16276b 3880 raise ExtractorError(u'ERROR: unable to extract video title')
ca6710ee 3881 video_title = result.group('title').strip()
991ba7fa
JC
3882
3883 # Get the embed page
db16276b 3884 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
991ba7fa 3885 if result is None:
db16276b 3886 raise ExtractorError(u'ERROR: unable to extract embed page')
991ba7fa 3887
ca6710ee
JC
3888 embed_page_url = result.group(0).strip()
3889 video_id = result.group('videoid')
991ba7fa 3890
ca6710ee
JC
3891 webpage = self._download_webpage(embed_page_url, video_id)
3892
991ba7fa 3893 # Get the video URL
db16276b 3894 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
991ba7fa 3895 if result is None:
db16276b 3896 raise ExtractorError(u'ERROR: unable to extract video url')
ca6710ee 3897 video_url = result.group('source')
991ba7fa
JC
3898
3899 info = {'id': video_id,
3900 'url': video_url,
991ba7fa
JC
3901 'title': video_title,
3902 'ext': 'flv',
3903 'format': 'flv',
991ba7fa
JC
3904 'player_url': embed_page_url}
3905
3906 return [info]
3907
ccf65f9d
PH
3908class EightTracksIE(InfoExtractor):
3909 IE_NAME = '8tracks'
25580f32 3910 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
ccf65f9d
PH
3911
3912 def _real_extract(self, url):
3913 mobj = re.match(self._VALID_URL, url)
3914 if mobj is None:
3915 raise ExtractorError(u'Invalid URL: %s' % url)
3916 playlist_id = mobj.group('id')
3917
3918 webpage = self._download_webpage(url, playlist_id)
3919
2a9983b7 3920 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
ccf65f9d
PH
3921 if not m:
3922 raise ExtractorError(u'Cannot find trax information')
3923 json_like = m.group(1)
3924 data = json.loads(json_like)
3925
3926 session = str(random.randint(0, 1000000000))
3927 mix_id = data['id']
3928 track_count = data['tracks_count']
3929 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3930 next_url = first_url
3931 res = []
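        # Traversal sketch: the first .../sets/<session>/play request returns
        # track 1; each later .../sets/<session>/next request passes the previous
        # track_id, until the response reports 'at_last_track'.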
3932 for i in itertools.count():
3933 api_json = self._download_webpage(next_url, playlist_id,
3934 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3935 errnote=u'Failed to download song information')
3936 api_data = json.loads(api_json)
3937 track_data = api_data[u'set']['track']
3938 info = {
3939 'id': track_data['id'],
3940 'url': track_data['track_file_stream_url'],
da4de959
PH
3941 'title': track_data['performer'] + u' - ' + track_data['name'],
3942 'raw_title': track_data['name'],
3943 'uploader_id': data['user']['login'],
ccf65f9d
PH
3944 'ext': 'm4a',
3945 }
3946 res.append(info)
3947 if api_data['set']['at_last_track']:
3948 break
3949 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3950 return res
991ba7fa 3951
da06e2da
OK
3952class KeekIE(InfoExtractor):
3953 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3954 IE_NAME = u'keek'
3955
3956 def _real_extract(self, url):
3957 m = re.match(self._VALID_URL, url)
3958 video_id = m.group('videoID')
3959 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3960 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3961 webpage = self._download_webpage(url, video_id)
3962 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
f0877a44 3963 title = unescapeHTML(m.group('title'))
da06e2da 3964 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
f0877a44 3965 uploader = unescapeHTML(m.group('uploader'))
da06e2da
OK
3966 info = {
3967 'id':video_id,
3968 'url':video_url,
3969 'ext': 'mp4',
3970 'title': title,
3971 'thumbnail': thumbnail,
3972 'uploader': uploader
f0877a44 3973 }
da06e2da
OK
3974 return [info]
3975
3a468f2d 3976class TEDIE(InfoExtractor):
414638cd
JMF
3977 _VALID_URL=r'''http://www.ted.com/
3978 (
3979 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3980 |
3981 ((?P<type_talk>talks)) # We have a simple talk
3982 )
3983 /(?P<name>\w+) # Here goes the name and then ".html"
3984 '''
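    # e.g. (hypothetical URLs)
    #   http://www.ted.com/talks/some_talk.html        -> a single talk
    #   http://www.ted.com/playlists/10/some_playlist  -> a playlist of talks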
3985
3986 def suitable(self, url):
3987 """Receives a URL and returns True if suitable for this IE."""
3988 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3989
3a468f2d 3990 def _real_extract(self, url):
414638cd
JMF
3991 m=re.match(self._VALID_URL, url, re.VERBOSE)
3992 if m.group('type_talk'):
3993 return [self._talk_info(url)]
3994 else :
3995 playlist_id=m.group('playlist_id')
3996 name=m.group('name')
3997 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
3998 return self._playlist_videos_info(url,name,playlist_id)
3999
4000 def _talk_video_link(self,mediaSlug):
4001 '''Returns the video link for that mediaSlug'''
4002 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4003
4004 def _playlist_videos_info(self,url,name,playlist_id=0):
4005 '''Returns the videos of the playlist'''
4006 video_RE=r'''
4007 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4008 ([.\s]*?)data-playlist_item_id="(\d+)"
4009 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4010 '''
4011 video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
4012 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4013 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4014 m_names=re.finditer(video_name_RE,webpage)
4015 info=[]
4016 for m_video, m_name in zip(m_videos,m_names):
4017 video_dic={
4018 'id': m_video.group('video_id'),
4019 'url': self._talk_video_link(m_video.group('mediaSlug')),
4020 'ext': 'mp4',
4021 'title': m_name.group('fullname')
4022 }
4023 info.append(video_dic)
4024 return info
4025 def _talk_info(self, url, video_id=0):
4026 """Return the video for the talk in the url"""
4027 m=re.match(self._VALID_URL, url,re.VERBOSE)
4028 videoName=m.group('name')
4029 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4030 # If the url includes the language we get the title translated
59d4c2fe 4031 title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
3a468f2d
JMF
4032 title=re.search(title_RE, webpage).group('title')
4033 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4034 "id":(?P<videoID>[\d]+).*?
4035 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4036 info_match=re.search(info_RE,webpage,re.VERBOSE)
4037 video_id=info_match.group('videoID')
4038 mediaSlug=info_match.group('mediaSlug')
414638cd 4039 video_url=self._talk_video_link(mediaSlug)
3a468f2d 4040 info = {
414638cd
JMF
4041 'id': video_id,
4042 'url': video_url,
3a468f2d
JMF
4043 'ext': 'mp4',
4044 'title': title
414638cd
JMF
4045 }
4046 return info
da06e2da 4047
58994225 4048class MySpassIE(InfoExtractor):
1ad5d872 4049 _VALID_URL = r'http://www.myspass.de/.*'
1ad5d872 4050
4051 def _real_extract(self, url):
4052 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
58994225 4053
1ad5d872 4054 # video id is the last path element of the URL
4055 # usually there is a trailing slash, so also try the second but last
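        # e.g. (hypothetical path) '/myspass/shows/foo/4996/': os.path.split()
        # first yields ('/myspass/shows/foo/4996', '') because of the trailing
        # slash, so the parent path is split again below to obtain '4996'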
4056 url_path = compat_urllib_parse_urlparse(url).path
4057 url_parent_path, video_id = os.path.split(url_path)
4058 if not video_id:
4059 _, video_id = os.path.split(url_parent_path)
4060
4061 # get metadata
4062 metadata_url = META_DATA_URL_TEMPLATE % video_id
4063 metadata_text = self._download_webpage(metadata_url, video_id)
4064 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4065
4066 # extract values from metadata
4067 url_flv_el = metadata.find('url_flv')
4068 if url_flv_el is None:
4069 self._downloader.trouble(u'ERROR: unable to extract download url')
4070 return
4071 video_url = url_flv_el.text
4072 extension = os.path.splitext(video_url)[1][1:]
4073 title_el = metadata.find('title')
4074 if title_el is None:
4075 self._downloader.trouble(u'ERROR: unable to extract title')
4076 return
4077 title = title_el.text
4078 format_id_el = metadata.find('format_id')
4079 if format_id_el is None:
4080 format = extension
4081 else:
4082 format = format_id_el.text
4083 description_el = metadata.find('description')
4084 if description_el is not None:
4085 description = description_el.text
4086 else:
4087 description = None
4088 imagePreview_el = metadata.find('imagePreview')
4089 if imagePreview_el is not None:
4090 thumbnail = imagePreview_el.text
4091 else:
4092 thumbnail = None
4093 info = {
4094 'id': video_id,
4095 'url': video_url,
4096 'title': title,
4097 'ext': extension,
4098 'format': format,
4099 'thumbnail': thumbnail,
4100 'description': description
4101 }
4102 return [info]
4103
4aeae91f
PH
4104def gen_extractors():
4105 """ Return a list of an instance of every supported extractor.
4106 The order does matter; the first extractor matched is the one handling the URL.
4107 """
4108 return [
4109 YoutubePlaylistIE(),
4110 YoutubeChannelIE(),
4111 YoutubeUserIE(),
4112 YoutubeSearchIE(),
4113 YoutubeIE(),
4114 MetacafeIE(),
4115 DailymotionIE(),
4116 GoogleSearchIE(),
4117 PhotobucketIE(),
4118 YahooIE(),
4119 YahooSearchIE(),
4120 DepositFilesIE(),
4121 FacebookIE(),
4122 BlipTVUserIE(),
4123 BlipTVIE(),
4124 VimeoIE(),
4125 MyVideoIE(),
4126 ComedyCentralIE(),
4127 EscapistIE(),
4128 CollegeHumorIE(),
4129 XVideosIE(),
4130 SoundcloudIE(),
4131 InfoQIE(),
4132 MixcloudIE(),
4133 StanfordOpenClassroomIE(),
4134 MTVIE(),
4135 YoukuIE(),
4136 XNXXIE(),
18be482a
JC
4137 YouJizzIE(),
4138 PornotubeIE(),
4139 YouPornIE(),
4aeae91f
PH
4140 GooglePlusIE(),
4141 ArteTvIE(),
4142 NBAIE(),
4143 JustinTVIE(),
4144 FunnyOrDieIE(),
4145 TweetReelIE(),
4146 SteamIE(),
4147 UstreamIE(),
ca0a0bbe 4148 RBMARadioIE(),
ccf65f9d 4149 EightTracksIE(),
da06e2da 4150 KeekIE(),
3a468f2d 4151 TEDIE(),
58994225 4152 MySpassIE(),
4aeae91f
PH
4153 GenericIE()
4154 ]
4155
4156