]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
handle YT urls with #/ redirects (closes #484)
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import datetime
5import HTMLParser
6import httplib
7import netrc
8import os
9import re
10import socket
11import time
12import urllib
13import urllib2
14import email.utils
921a1455 15import xml.etree.ElementTree
302efc19 16import random
17import math
921a1455 18from urlparse import parse_qs
d77c3dfd
FV
19
20try:
21 import cStringIO as StringIO
22except ImportError:
23 import StringIO
24
d11d05d0 25from utils import *
d77c3dfd
FV
26
27
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and produces, for each video
    the URL refers to, a dictionary of metadata which is then handed to
    the FileDownloader (which may download the video, print fields, etc.).
    Every result dictionary must carry the fields:

    id:          Video identifier.
    url:         Final video URL.
    uploader:    Nickname of the video uploader.
    title:       Literal title.
    ext:         Video filename extension.
    format:      Video format.
    player_url:  SWF Player URL (may be None).

    Optional fields, used mainly when forced printing functions are
    called (e.g. to back a video search front end):

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses should redefine _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and usually be added to the list of
    extractors.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports to and is driven by."""
        self._downloader = downloader

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def initialize(self):
        """Run one-time initialization (authentication, etc.) if needed."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
96
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp: matches full watch URLs (including #/ anchor
    # redirects and youtu.be short links) as well as naked video IDs.
    # Group 1 is the whole optional URL prefix; group 2 is the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces English pages so the scraping regexps below keep matching.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of an age-verification style redirect.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, but free (WebM) formats promoted ahead of equal-quality MP4.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing defaults to 'flv'
    # at the lookup sites.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
202e76cf
FV
158 def suitable(self, url):
159 """Receives a URL and returns True if suitable for this IE."""
160 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
161
d77c3dfd
FV
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
197
198 def _closed_captions_xml_to_srt(self, xml_string):
199 srt = ''
200 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
201 # TODO parse xml instead of regex
202 for n, (start, dur_tag, dur, caption) in enumerate(texts):
203 if not dur: dur = '4'
204 start = float(start)
205 end = start + float(dur)
206 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
207 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
9e6dd238 208 caption = unescapeHTML(caption)
6ab92c8b 209 caption = unescapeHTML(caption) # double cycle, intentional
54041793 210 srt += str(n+1) + '\n'
d77c3dfd
FV
211 srt += start + ' --> ' + end + '\n'
212 srt += caption + '\n\n'
213 return srt
214
215 def _print_formats(self, formats):
216 print 'Available formats:'
217 for x in formats:
218 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
219
    def _real_initialize(self):
        """Set the page language, then optionally log in and confirm age.

        All failures are reported through the downloader (warnings for
        language/login problems, an error for age confirmation) and the
        method returns early; nothing is raised to the caller.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            # NOTE(review): assumes 'password' is always present whenever
            # 'username' is set -- confirm against the option parser.
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (the hl=en URL forces English pages so later
        # scraping regexps keep matching)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # The login form being echoed back means the credentials
            # were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return
288
    def _real_extract(self, url):
        """Extract metadata and direct stream URLs for a YouTube video.

        Returns a list of result dicts (one per selected format), or
        None after reporting trouble through the downloader.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL (group 2 of _VALID_URL is the ID)
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL (backslash-escaped in the
        # page's JSON; the re.sub below unescapes it)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one response
        # contains a 'token' field
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to
        # YYYYMMDD; left as u'NA' if nothing parses
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except silently ignores any
                    # error, not just ValueError -- consider narrowing.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: best-effort; any Trouble raised inside is
        # reported as a warning and extraction continues
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # Map lang_code -> track name for every listed track
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: explicit option > English > first listed
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # token (not referenced again in this method)
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): ud['sig'] is not guarded by the filter above,
            # so an entry with itag+url but no sig raises KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # One result dict per selected format
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
            })
        return results
d77c3dfd
FV
486
487
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video ID, group 2 the simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age via the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age (POSTs the "over 18" form to disable filtering)
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and the media URL for a Metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: IDs of the form yt-XXXX are
        # delegated back to the downloader as a YouTube watch URL
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct mediaURL form
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars blob for mediaData
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
d77c3dfd
FV
615
616
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 is the raw video slug (may include title/query parts,
    # stripped in _real_extract).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the best-quality media URL from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the "_title" suffix and any query string from the slug
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        # (family_filter=off cookie avoids the content filter page)
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Probe quality keys from best to worst; for/else triggers when
        # none of them appears in flashvars
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        # Uploader is optional: only a warning when missing
        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        else:
            video_uploader = mobj.group(1)

        # Upload date scraped as DD-MM-YYYY and rearranged to YYYYMMDD
        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
d77c3dfd
FV
709
710
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Group 1 is the docid query parameter.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the media URL for a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No mp4 download link; fall back to the flv stream URL,
            # which is hex-escaped (\x3d, \x26) in the page source
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail: only when forced, since it requires
        # an extra search-page request
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
d77c3dfd
FV
804
805
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 is the .flv filename taken from the 'current' parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the media URL for a Photobucket video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> match (groups 1 and 2)
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
d77c3dfd
FV
871
872
873class YahooIE(InfoExtractor):
874 """Information extractor for video.yahoo.com."""
875
876 # _VALID_URL matches all Yahoo! Video URLs
877 # _VPAGE_URL matches only the extractable '/watch/' URLs
878 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
879 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
880 IE_NAME = u'video.yahoo'
881
882 def __init__(self, downloader=None):
883 InfoExtractor.__init__(self, downloader)
884
885 def report_download_webpage(self, video_id):
886 """Report webpage download."""
887 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
888
889 def report_extraction(self, video_id):
890 """Report information extraction."""
891 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
892
893 def _real_extract(self, url, new_video=True):
894 # Extract ID from URL
895 mobj = re.match(self._VALID_URL, url)
896 if mobj is None:
897 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
898 return
899
d77c3dfd
FV
900 video_id = mobj.group(2)
901 video_extension = 'flv'
902
903 # Rewrite valid but non-extractable URLs as
904 # extractable English language /watch/ URLs
905 if re.match(self._VPAGE_URL, url) is None:
906 request = urllib2.Request(url)
907 try:
908 webpage = urllib2.urlopen(request).read()
909 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
910 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
911 return
912
913 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
914 if mobj is None:
915 self._downloader.trouble(u'ERROR: Unable to extract id field')
916 return
917 yahoo_id = mobj.group(1)
918
919 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
920 if mobj is None:
921 self._downloader.trouble(u'ERROR: Unable to extract vid field')
922 return
923 yahoo_vid = mobj.group(1)
924
925 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
926 return self._real_extract(url, new_video=False)
927
928 # Retrieve video webpage to extract further information
929 request = urllib2.Request(url)
930 try:
931 self.report_download_webpage(video_id)
932 webpage = urllib2.urlopen(request).read()
933 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
934 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
935 return
936
937 # Extract uploader and title from webpage
938 self.report_extraction(video_id)
939 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
940 if mobj is None:
941 self._downloader.trouble(u'ERROR: unable to extract video title')
942 return
943 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
944
945 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
946 if mobj is None:
947 self._downloader.trouble(u'ERROR: unable to extract video uploader')
948 return
949 video_uploader = mobj.group(1).decode('utf-8')
950
951 # Extract video thumbnail
952 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
953 if mobj is None:
954 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
955 return
956 video_thumbnail = mobj.group(1).decode('utf-8')
957
958 # Extract video description
959 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
960 if mobj is None:
961 self._downloader.trouble(u'ERROR: unable to extract video description')
962 return
963 video_description = mobj.group(1).decode('utf-8')
964 if not video_description:
965 video_description = 'No description available.'
966
967 # Extract video height and width
968 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
969 if mobj is None:
970 self._downloader.trouble(u'ERROR: unable to extract video height')
971 return
972 yv_video_height = mobj.group(1)
973
974 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
975 if mobj is None:
976 self._downloader.trouble(u'ERROR: unable to extract video width')
977 return
978 yv_video_width = mobj.group(1)
979
980 # Retrieve video playlist to extract media URL
981 # I'm not completely sure what all these options are, but we
982 # seem to need most of them, otherwise the server sends a 401.
983 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
984 yv_bitrate = '700' # according to Wikipedia this is hard-coded
985 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
986 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
987 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
988 try:
989 self.report_download_webpage(video_id)
990 webpage = urllib2.urlopen(request).read()
991 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
992 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
993 return
994
995 # Extract media URL from playlist XML
996 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
997 if mobj is None:
998 self._downloader.trouble(u'ERROR: Unable to extract media URL')
999 return
1000 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
9e6dd238 1001 video_url = unescapeHTML(video_url)
d77c3dfd 1002
58ca755f
FV
1003 return [{
1004 'id': video_id.decode('utf-8'),
1005 'url': video_url,
1006 'uploader': video_uploader,
1007 'upload_date': u'NA',
1008 'title': video_title,
58ca755f
FV
1009 'ext': video_extension.decode('utf-8'),
1010 'thumbnail': video_thumbnail.decode('utf-8'),
1011 'description': video_description,
1012 'thumbnail': video_thumbnail,
1013 'player_url': None,
1014 }]
d77c3dfd
FV
1015
1016
1017class VimeoIE(InfoExtractor):
1018 """Information extractor for vimeo.com."""
1019
1020 # _VALID_URL matches Vimeo URLs
1021 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1022 IE_NAME = u'vimeo'
1023
1024 def __init__(self, downloader=None):
1025 InfoExtractor.__init__(self, downloader)
1026
1027 def report_download_webpage(self, video_id):
1028 """Report webpage download."""
1029 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1030
1031 def report_extraction(self, video_id):
1032 """Report information extraction."""
1033 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1034
1035 def _real_extract(self, url, new_video=True):
1036 # Extract ID from URL
1037 mobj = re.match(self._VALID_URL, url)
1038 if mobj is None:
1039 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1040 return
1041
d77c3dfd
FV
1042 video_id = mobj.group(1)
1043
1044 # Retrieve video webpage to extract further information
1045 request = urllib2.Request(url, None, std_headers)
1046 try:
1047 self.report_download_webpage(video_id)
1048 webpage = urllib2.urlopen(request).read()
1049 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1050 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1051 return
1052
1053 # Now we begin extracting as much information as we can from what we
1054 # retrieved. First we extract the information common to all extractors,
1055 # and latter we extract those that are Vimeo specific.
1056 self.report_extraction(video_id)
1057
1058 # Extract the config JSON
1059 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1060 try:
1061 config = json.loads(config)
1062 except:
1063 self._downloader.trouble(u'ERROR: unable to extract info section')
1064 return
1065
1066 # Extract title
1067 video_title = config["video"]["title"]
d77c3dfd
FV
1068
1069 # Extract uploader
1070 video_uploader = config["video"]["owner"]["name"]
1071
1072 # Extract video thumbnail
1073 video_thumbnail = config["video"]["thumbnail"]
1074
1075 # Extract video description
9beb5af8
FV
1076 video_description = get_element_by_id("description", webpage.decode('utf8'))
1077 if video_description: video_description = clean_html(video_description)
9e6dd238 1078 else: video_description = ''
d77c3dfd
FV
1079
1080 # Extract upload date
1081 video_upload_date = u'NA'
1082 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1083 if mobj is not None:
1084 video_upload_date = mobj.group(1)
1085
1086 # Vimeo specific: extract request signature and timestamp
1087 sig = config['request']['signature']
1088 timestamp = config['request']['timestamp']
1089
1090 # Vimeo specific: extract video codec and quality information
1091 # TODO bind to format param
1092 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1093 for codec in codecs:
1094 if codec[0] in config["video"]["files"]:
1095 video_codec = codec[0]
1096 video_extension = codec[1]
1097 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1098 else: quality = 'sd'
1099 break
1100 else:
1101 self._downloader.trouble(u'ERROR: no known codec found')
1102 return
1103
1104 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1105 %(video_id, sig, timestamp, quality, video_codec.upper())
1106
58ca755f
FV
1107 return [{
1108 'id': video_id,
1109 'url': video_url,
1110 'uploader': video_uploader,
1111 'upload_date': video_upload_date,
1112 'title': video_title,
58ca755f
FV
1113 'ext': video_extension,
1114 'thumbnail': video_thumbnail,
1115 'description': video_description,
1116 'player_url': None,
1117 }]
d77c3dfd
FV
1118
1119
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor.

	Tries, in order: following HTTP redirects (URL shorteners), a JW Player
	SWFObject embed, and finally any 'file='/'source=' HTTP URL in the page.
	"""

	_VALID_URL = r'.*'
	IE_NAME = u'generic'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def report_following_redirect(self, new_url):
		"""Report that a redirect was detected and is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain.

		Issues a HEAD request; if the final URL differs from the input, hands
		the new URL back to the downloader and returns True. Returns False
		when the URL is not a redirect.
		"""
		class HeadRequest(urllib2.Request):
			# urllib2.Request always does GET/POST; override to send HEAD
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					# some servers emit unencoded spaces in Location headers
					newurl = newurl.replace(' ', '%20')
					# body-describing headers make no sense on a HEAD request
					newheaders = dict((k,v) for k,v in req.headers.items()
							  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
							   headers=newheaders,
							   origin_req_host=req.get_origin_req_host(),
							   unverifiable=True)
				else:
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# drain and close the 405 response before retrying
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
						  if k.lower() not in ("content-length", "content-type"))
				# re-issue as a plain (GET) request through the same opener
				return self.parent.open(urllib2.Request(req.get_full_url(),
								 headers=newheaders,
								 origin_req_host=req.get_origin_req_host(),
								 unverifiable=True))

		# Build our opener
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
				HTTPMethodFallback, HEADRedirectHandler,
				urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# same URL back means no redirect happened
		if url == new_url: return False

		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True

	def _real_extract(self, url):
		# Redirects restart the whole extraction chain on the target URL
		if self._test_redirect(url): return

		# provisional id for progress messages; refined once the real
		# media URL is known
		video_id = url.split('/')[-1]
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return
		except ValueError, err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		if mobj is None:
			# Broaden the search a little bit
			mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
				return

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_url = urllib.unquote(mobj.group(1))
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		#   Video Title - Site Name
		#   Site Name | Video Title
		#   Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_uploader = mobj.group(1).decode('utf-8')

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
d77c3dfd
FV
1265
1266
1267class YoutubeSearchIE(InfoExtractor):
1268 """Information Extractor for YouTube search queries."""
1269 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1270 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
d77c3dfd
FV
1271 _max_youtube_results = 1000
1272 IE_NAME = u'youtube:search'
1273
58ca755f 1274 def __init__(self, downloader=None):
d77c3dfd 1275 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1276
1277 def report_download_page(self, query, pagenum):
d4e16d3e 1278 """Report attempt to download search page with given number."""
d77c3dfd
FV
1279 query = query.decode(preferredencoding())
1280 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1281
d77c3dfd
FV
1282 def _real_extract(self, query):
1283 mobj = re.match(self._VALID_URL, query)
1284 if mobj is None:
1285 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1286 return
1287
1288 prefix, query = query.split(':')
1289 prefix = prefix[8:]
1290 query = query.encode('utf-8')
1291 if prefix == '':
1292 self._download_n_results(query, 1)
1293 return
1294 elif prefix == 'all':
1295 self._download_n_results(query, self._max_youtube_results)
1296 return
1297 else:
1298 try:
1299 n = long(prefix)
1300 if n <= 0:
1301 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1302 return
1303 elif n > self._max_youtube_results:
1304 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1305 n = self._max_youtube_results
1306 self._download_n_results(query, n)
1307 return
1308 except ValueError: # parsing prefix as integer fails
1309 self._download_n_results(query, 1)
1310 return
1311
1312 def _download_n_results(self, query, n):
1313 """Downloads a specified number of results for a query"""
1314
1315 video_ids = []
1316 pagenum = 0
1317 limit = n
1318
1319 while (50 * pagenum) < limit:
1320 self.report_download_page(query, pagenum+1)
1321 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1322 request = urllib2.Request(result_url)
1323 try:
1324 data = urllib2.urlopen(request).read()
1325 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1326 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1327 return
1328 api_response = json.loads(data)['data']
1329
1330 new_ids = list(video['id'] for video in api_response['items'])
1331 video_ids += new_ids
1332
1333 limit = min(n, api_response['totalItems'])
1334 pagenum += 1
1335
1336 if len(video_ids) > n:
1337 video_ids = video_ids[:n]
1338 for id in video_ids:
58ca755f 1339 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
d77c3dfd
FV
1340 return
1341
1342
1343class GoogleSearchIE(InfoExtractor):
1344 """Information Extractor for Google Video search queries."""
1345 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1346 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1347 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1348 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
d77c3dfd
FV
1349 _max_google_results = 1000
1350 IE_NAME = u'video.google:search'
1351
58ca755f 1352 def __init__(self, downloader=None):
d77c3dfd 1353 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1354
1355 def report_download_page(self, query, pagenum):
1356 """Report attempt to download playlist page with given number."""
1357 query = query.decode(preferredencoding())
1358 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1359
d77c3dfd
FV
1360 def _real_extract(self, query):
1361 mobj = re.match(self._VALID_URL, query)
1362 if mobj is None:
1363 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1364 return
1365
1366 prefix, query = query.split(':')
1367 prefix = prefix[8:]
1368 query = query.encode('utf-8')
1369 if prefix == '':
1370 self._download_n_results(query, 1)
1371 return
1372 elif prefix == 'all':
1373 self._download_n_results(query, self._max_google_results)
1374 return
1375 else:
1376 try:
1377 n = long(prefix)
1378 if n <= 0:
1379 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1380 return
1381 elif n > self._max_google_results:
1382 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1383 n = self._max_google_results
1384 self._download_n_results(query, n)
1385 return
1386 except ValueError: # parsing prefix as integer fails
1387 self._download_n_results(query, 1)
1388 return
1389
1390 def _download_n_results(self, query, n):
1391 """Downloads a specified number of results for a query"""
1392
1393 video_ids = []
1394 pagenum = 0
1395
1396 while True:
1397 self.report_download_page(query, pagenum)
1398 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1399 request = urllib2.Request(result_url)
1400 try:
1401 page = urllib2.urlopen(request).read()
1402 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1403 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1404 return
1405
1406 # Extract video identifiers
1407 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1408 video_id = mobj.group(1)
1409 if video_id not in video_ids:
1410 video_ids.append(video_id)
1411 if len(video_ids) == n:
1412 # Specified n videos reached
1413 for id in video_ids:
58ca755f 1414 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
d77c3dfd
FV
1415 return
1416
1417 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1418 for id in video_ids:
58ca755f 1419 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
d77c3dfd
FV
1420 return
1421
1422 pagenum = pagenum + 1
1423
1424
1425class YahooSearchIE(InfoExtractor):
1426 """Information Extractor for Yahoo! Video search queries."""
1427 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1428 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1429 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1430 _MORE_PAGES_INDICATOR = r'\s*Next'
d77c3dfd
FV
1431 _max_yahoo_results = 1000
1432 IE_NAME = u'video.yahoo:search'
1433
58ca755f 1434 def __init__(self, downloader=None):
d77c3dfd 1435 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1436
1437 def report_download_page(self, query, pagenum):
1438 """Report attempt to download playlist page with given number."""
1439 query = query.decode(preferredencoding())
1440 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1441
d77c3dfd
FV
1442 def _real_extract(self, query):
1443 mobj = re.match(self._VALID_URL, query)
1444 if mobj is None:
1445 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1446 return
1447
1448 prefix, query = query.split(':')
1449 prefix = prefix[8:]
1450 query = query.encode('utf-8')
1451 if prefix == '':
1452 self._download_n_results(query, 1)
1453 return
1454 elif prefix == 'all':
1455 self._download_n_results(query, self._max_yahoo_results)
1456 return
1457 else:
1458 try:
1459 n = long(prefix)
1460 if n <= 0:
1461 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1462 return
1463 elif n > self._max_yahoo_results:
1464 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1465 n = self._max_yahoo_results
1466 self._download_n_results(query, n)
1467 return
1468 except ValueError: # parsing prefix as integer fails
1469 self._download_n_results(query, 1)
1470 return
1471
1472 def _download_n_results(self, query, n):
1473 """Downloads a specified number of results for a query"""
1474
1475 video_ids = []
1476 already_seen = set()
1477 pagenum = 1
1478
1479 while True:
1480 self.report_download_page(query, pagenum)
1481 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1482 request = urllib2.Request(result_url)
1483 try:
1484 page = urllib2.urlopen(request).read()
1485 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1487 return
1488
1489 # Extract video identifiers
1490 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1491 video_id = mobj.group(1)
1492 if video_id not in already_seen:
1493 video_ids.append(video_id)
1494 already_seen.add(video_id)
1495 if len(video_ids) == n:
1496 # Specified n videos reached
1497 for id in video_ids:
58ca755f 1498 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1499 return
1500
1501 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1502 for id in video_ids:
58ca755f 1503 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1504 return
1505
1506 pagenum = pagenum + 1
1507
1508
1509class YoutubePlaylistIE(InfoExtractor):
1510 """Information Extractor for YouTube playlists."""
1511
10daa766 1512 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
d77c3dfd 1513 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
10daa766 1514 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
d4e16d3e 1515 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
d77c3dfd
FV
1516 IE_NAME = u'youtube:playlist'
1517
58ca755f 1518 def __init__(self, downloader=None):
d77c3dfd 1519 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1520
1521 def report_download_page(self, playlist_id, pagenum):
1522 """Report attempt to download playlist page with given number."""
1523 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1524
d77c3dfd
FV
1525 def _real_extract(self, url):
1526 # Extract playlist id
1527 mobj = re.match(self._VALID_URL, url)
1528 if mobj is None:
1529 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1530 return
1531
1532 # Single video case
1533 if mobj.group(3) is not None:
58ca755f 1534 self._downloader.download([mobj.group(3)])
d77c3dfd
FV
1535 return
1536
1537 # Download playlist pages
1538 # prefix is 'p' as default for playlists but there are other types that need extra care
1539 playlist_prefix = mobj.group(1)
1540 if playlist_prefix == 'a':
1541 playlist_access = 'artist'
1542 else:
1543 playlist_prefix = 'p'
1544 playlist_access = 'view_play_list'
1545 playlist_id = mobj.group(2)
1546 video_ids = []
1547 pagenum = 1
1548
1549 while True:
1550 self.report_download_page(playlist_id, pagenum)
1551 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1552 request = urllib2.Request(url)
1553 try:
1554 page = urllib2.urlopen(request).read()
1555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1556 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1557 return
1558
1559 # Extract video identifiers
1560 ids_in_page = []
1561 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1562 if mobj.group(1) not in ids_in_page:
1563 ids_in_page.append(mobj.group(1))
1564 video_ids.extend(ids_in_page)
1565
1566 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567 break
1568 pagenum = pagenum + 1
1569
1570 playliststart = self._downloader.params.get('playliststart', 1) - 1
1571 playlistend = self._downloader.params.get('playlistend', -1)
1572 if playlistend == -1:
1573 video_ids = video_ids[playliststart:]
1574 else:
1575 video_ids = video_ids[playliststart:playlistend]
1576
1577 for id in video_ids:
58ca755f 1578 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
d77c3dfd
FV
1579 return
1580
1581
902b2a0a
FV
1582class YoutubeChannelIE(InfoExtractor):
1583 """Information Extractor for YouTube channels."""
1584
1585 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1586 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1587 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1588 IE_NAME = u'youtube:channel'
1589
1590 def report_download_page(self, channel_id, pagenum):
1591 """Report attempt to download channel page with given number."""
1592 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1593
1594 def _real_extract(self, url):
1595 # Extract channel id
1596 mobj = re.match(self._VALID_URL, url)
1597 if mobj is None:
1598 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1599 return
1600
1601 # Download channel pages
1602 channel_id = mobj.group(1)
1603 video_ids = []
1604 pagenum = 1
1605
1606 while True:
1607 self.report_download_page(channel_id, pagenum)
1608 url = self._TEMPLATE_URL % (channel_id, pagenum)
1609 request = urllib2.Request(url)
1610 try:
1611 page = urllib2.urlopen(request).read()
1612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1613 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1614 return
1615
1616 # Extract video identifiers
1617 ids_in_page = []
1618 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1619 if mobj.group(1) not in ids_in_page:
1620 ids_in_page.append(mobj.group(1))
1621 video_ids.extend(ids_in_page)
1622
1623 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1624 break
1625 pagenum = pagenum + 1
1626
1627 for id in video_ids:
1628 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1629 return
1630
1631
d77c3dfd
FV
1632class YoutubeUserIE(InfoExtractor):
1633 """Information Extractor for YouTube users."""
1634
1635 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1636 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1637 _GDATA_PAGE_SIZE = 50
1638 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1639 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
d77c3dfd
FV
1640 IE_NAME = u'youtube:user'
1641
58ca755f 1642 def __init__(self, downloader=None):
d77c3dfd 1643 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1644
1645 def report_download_page(self, username, start_index):
1646 """Report attempt to download user page."""
1647 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1648 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1649
d77c3dfd
FV
1650 def _real_extract(self, url):
1651 # Extract username
1652 mobj = re.match(self._VALID_URL, url)
1653 if mobj is None:
1654 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1655 return
1656
1657 username = mobj.group(1)
1658
1659 # Download video ids using YouTube Data API. Result size per
1660 # query is limited (currently to 50 videos) so we need to query
1661 # page by page until there are no video ids - it means we got
1662 # all of them.
1663
1664 video_ids = []
1665 pagenum = 0
1666
1667 while True:
1668 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1669 self.report_download_page(username, start_index)
1670
1671 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1672
1673 try:
1674 page = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1677 return
1678
1679 # Extract video identifiers
1680 ids_in_page = []
1681
1682 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1683 if mobj.group(1) not in ids_in_page:
1684 ids_in_page.append(mobj.group(1))
1685
1686 video_ids.extend(ids_in_page)
1687
1688 # A little optimization - if current page is not
1689 # "full", ie. does not contain PAGE_SIZE video ids then
1690 # we can assume that this page is the last one - there
1691 # are no more ids on further pages - no need to query
1692 # again.
1693
1694 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1695 break
1696
1697 pagenum += 1
1698
1699 all_ids_count = len(video_ids)
1700 playliststart = self._downloader.params.get('playliststart', 1) - 1
1701 playlistend = self._downloader.params.get('playlistend', -1)
1702
1703 if playlistend == -1:
1704 video_ids = video_ids[playliststart:]
1705 else:
1706 video_ids = video_ids[playliststart:playlistend]
1707
1708 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1709 (username, all_ids_count, len(video_ids)))
1710
1711 for video_id in video_ids:
58ca755f 1712 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1713
1714
eeeb4daa
JCGS
1715class BlipTVUserIE(InfoExtractor):
1716 """Information Extractor for blip.tv users."""
1717
1718 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
11a141de 1719 _PAGE_SIZE = 12
eeeb4daa
JCGS
1720 IE_NAME = u'blip.tv:user'
1721
1722 def __init__(self, downloader=None):
1723 InfoExtractor.__init__(self, downloader)
1724
1725 def report_download_page(self, username, pagenum):
1726 """Report attempt to download user page."""
1727 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1728 (self.IE_NAME, username, pagenum))
1729
1730 def _real_extract(self, url):
1731 # Extract username
1732 mobj = re.match(self._VALID_URL, url)
1733 if mobj is None:
1734 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1735 return
1736
1737 username = mobj.group(1)
1738
11a141de 1739 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
eeeb4daa
JCGS
1740
1741 request = urllib2.Request(url)
1742
1743 try:
1744 page = urllib2.urlopen(request).read().decode('utf-8')
11a141de
FV
1745 mobj = re.search(r'data-users-id="([^"]+)"', page)
1746 page_base = page_base % mobj.group(1)
eeeb4daa
JCGS
1747 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1748 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1749 return
1750
1751
11a141de
FV
1752 # Download video ids using BlipTV Ajax calls. Result size per
1753 # query is limited (currently to 12 videos) so we need to query
eeeb4daa
JCGS
1754 # page by page until there are no video ids - it means we got
1755 # all of them.
1756
1757 video_ids = []
11a141de 1758 pagenum = 1
eeeb4daa
JCGS
1759
1760 while True:
1761 self.report_download_page(username, pagenum)
1762
11a141de 1763 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
eeeb4daa
JCGS
1764
1765 try:
1766 page = urllib2.urlopen(request).read().decode('utf-8')
1767 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1768 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1769 return
1770
1771 # Extract video identifiers
1772 ids_in_page = []
1773
1774 for mobj in re.finditer(r'href="/([^"]+)"', page):
1775 if mobj.group(1) not in ids_in_page:
1776 ids_in_page.append(unescapeHTML(mobj.group(1)))
1777
1778 video_ids.extend(ids_in_page)
1779
1780 # A little optimization - if current page is not
1781 # "full", ie. does not contain PAGE_SIZE video ids then
1782 # we can assume that this page is the last one - there
1783 # are no more ids on further pages - no need to query
1784 # again.
1785
1786 if len(ids_in_page) < self._PAGE_SIZE:
1787 break
1788
1789 pagenum += 1
1790
1791 all_ids_count = len(video_ids)
1792 playliststart = self._downloader.params.get('playliststart', 1) - 1
1793 playlistend = self._downloader.params.get('playlistend', -1)
1794
1795 if playlistend == -1:
1796 video_ids = video_ids[playliststart:]
1797 else:
1798 video_ids = video_ids[playliststart:playlistend]
1799
1800 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1801 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1802
1803 for video_id in video_ids:
1804 self._downloader.download([u'http://blip.tv/'+video_id])
1805
1806
d77c3dfd
FV
1807class DepositFilesIE(InfoExtractor):
1808 """Information extractor for depositfiles.com"""
1809
1810 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1811 IE_NAME = u'DepositFiles'
1812
1813 def __init__(self, downloader=None):
1814 InfoExtractor.__init__(self, downloader)
1815
1816 def report_download_webpage(self, file_id):
1817 """Report webpage download."""
1818 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1819
1820 def report_extraction(self, file_id):
1821 """Report information extraction."""
1822 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1823
1824 def _real_extract(self, url):
d77c3dfd
FV
1825 file_id = url.split('/')[-1]
1826 # Rebuild url in english locale
1827 url = 'http://depositfiles.com/en/files/' + file_id
1828
1829 # Retrieve file webpage with 'Free download' button pressed
1830 free_download_indication = { 'gateway_result' : '1' }
1831 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1832 try:
1833 self.report_download_webpage(file_id)
1834 webpage = urllib2.urlopen(request).read()
1835 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1836 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1837 return
1838
1839 # Search for the real file URL
1840 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1841 if (mobj is None) or (mobj.group(1) is None):
1842 # Try to figure out reason of the error.
1843 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1844 if (mobj is not None) and (mobj.group(1) is not None):
1845 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1846 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1847 else:
1848 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1849 return
1850
1851 file_url = mobj.group(1)
1852 file_extension = os.path.splitext(file_url)[1][1:]
1853
1854 # Search for file title
1855 mobj = re.search(r'<b title="(.*?)">', webpage)
1856 if mobj is None:
1857 self._downloader.trouble(u'ERROR: unable to extract title')
1858 return
1859 file_title = mobj.group(1).decode('utf-8')
1860
58ca755f
FV
1861 return [{
1862 'id': file_id.decode('utf-8'),
1863 'url': file_url.decode('utf-8'),
1864 'uploader': u'NA',
1865 'upload_date': u'NA',
1866 'title': file_title,
58ca755f
FV
1867 'ext': file_extension.decode('utf-8'),
1868 'format': u'NA',
1869 'player_url': None,
1870 }]
d77c3dfd
FV
1871
1872
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Formats in descending quality order; format selection below relies on this.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each regex scrapes a value out of inline JavaScript
        # calls / HTML in the video page. Missing fields are simply absent
        # from the returned dict - callers must check membership.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped inside the page; undo that escaping.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one entry per available quality that appears in the page.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc, if given."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials available: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse it, and return one result per chosen format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        # NOTE(review): _parse_page never sets 'upload_date', so this branch
        # appears to be dead code unless another caller populates it - verify.
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never bound and
        # the loop below raises NameError - consider an explicit error instead.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
d77c3dfd
FV
2078
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Regex used to pull the filename extension out of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv's JSON metadata for the URL, or fall back to a direct download.

        The request appends skin=json; if the server instead answers with a
        video/* Content-Type, the URL itself is the media file and is queued
        as a direct download (keeping the open handle in 'urlhandle').
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query, reusing '&' if a query string exists.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                # 'urlh' is still the open JSON response from above.
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                # NOTE(review): 'json' is not imported at the top of this file;
                # presumably it arrives via 'from utils import *' - verify.
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves different content to iTunes clients; this mutates the
        # module-global default headers for the subsequent media download.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
d77c3dfd
FV
2165
2166
2167class MyVideoIE(InfoExtractor):
2168 """Information Extractor for myvideo.de."""
2169
2170 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2171 IE_NAME = u'myvideo'
2172
2173 def __init__(self, downloader=None):
2174 InfoExtractor.__init__(self, downloader)
2175
2176 def report_download_webpage(self, video_id):
2177 """Report webpage download."""
2178 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2179
2180 def report_extraction(self, video_id):
2181 """Report information extraction."""
2182 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2183
2184 def _real_extract(self,url):
2185 mobj = re.match(self._VALID_URL, url)
2186 if mobj is None:
2187 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2188 return
2189
2190 video_id = mobj.group(1)
2191
2192 # Get video webpage
2193 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2194 try:
2195 self.report_download_webpage(video_id)
2196 webpage = urllib2.urlopen(request).read()
2197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2198 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2199 return
2200
2201 self.report_extraction(video_id)
2202 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2203 webpage)
2204 if mobj is None:
2205 self._downloader.trouble(u'ERROR: unable to extract media URL')
2206 return
2207 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2208
2209 mobj = re.search('<title>([^<]+)</title>', webpage)
2210 if mobj is None:
2211 self._downloader.trouble(u'ERROR: unable to extract title')
2212 return
2213
2214 video_title = mobj.group(1)
d77c3dfd 2215
58ca755f
FV
2216 return [{
2217 'id': video_id,
2218 'url': video_url,
2219 'uploader': u'NA',
2220 'upload_date': u'NA',
2221 'title': video_title,
58ca755f
FV
2222 'ext': u'flv',
2223 'format': u'NA',
2224 'player_url': None,
2225 }]
d77c3dfd
FV
2226
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of a media configuration file."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve shortnames/redirects, then fetch the MRSS index and one
        media configuration per item, returning one result per media id."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortnames like ':tds' map to the show's full-episodes landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode in the URL means "download the newest episode" - the
        # site redirects the landing page to it.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The Flash player URL embeds the mtvn media URI we need for the feeds.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # Media ids look like 'mgid:...:<show>.com:<id>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Each <rendition> carries one bitrate/src pair.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            results.append(info)

        return results
d77c3dfd
FV
2358
2359
2360class EscapistIE(InfoExtractor):
2361 """Information extractor for The Escapist """
2362
2363 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2364 IE_NAME = u'escapist'
2365
2366 def report_extraction(self, showName):
2367 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2368
2369 def report_config_download(self, showName):
2370 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2371
2372 def _real_extract(self, url):
d77c3dfd
FV
2373 mobj = re.match(self._VALID_URL, url)
2374 if mobj is None:
2375 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2376 return
2377 showName = mobj.group('showname')
2378 videoId = mobj.group('episode')
2379
2380 self.report_extraction(showName)
2381 try:
3210735c
PH
2382 webPage = urllib2.urlopen(url)
2383 webPageBytes = webPage.read()
2384 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2385 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
d77c3dfd
FV
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2388 return
2389
2390 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
9e6dd238 2391 description = unescapeHTML(descMatch.group(1))
d77c3dfd 2392 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
9e6dd238 2393 imgUrl = unescapeHTML(imgMatch.group(1))
d77c3dfd 2394 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
9e6dd238 2395 playerUrl = unescapeHTML(playerUrlMatch.group(1))
d77c3dfd
FV
2396 configUrlMatch = re.search('config=(.*)$', playerUrl)
2397 configUrl = urllib2.unquote(configUrlMatch.group(1))
2398
2399 self.report_config_download(showName)
2400 try:
2401 configJSON = urllib2.urlopen(configUrl).read()
2402 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2403 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2404 return
2405
2406 # Technically, it's JavaScript, not JSON
2407 configJSON = configJSON.replace("'", '"')
2408
2409 try:
2410 config = json.loads(configJSON)
2411 except (ValueError,), err:
2412 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2413 return
2414
2415 playlist = config['playlist']
2416 videoUrl = playlist[1]['url']
2417
d77c3dfd
FV
2418 info = {
2419 'id': videoId,
2420 'url': videoUrl,
2421 'uploader': showName,
2422 'upload_date': None,
2423 'title': showName,
d77c3dfd
FV
2424 'ext': 'flv',
2425 'format': 'flv',
2426 'thumbnail': imgUrl,
2427 'description': description,
2428 'player_url': playerUrl,
2429 }
2430
58ca755f 2431 return [info]
d77c3dfd
FV
2432
2433
2434class CollegeHumorIE(InfoExtractor):
2435 """Information extractor for collegehumor.com"""
2436
2437 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2438 IE_NAME = u'collegehumor'
2439
2440 def report_webpage(self, video_id):
2441 """Report information extraction."""
2442 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2443
2444 def report_extraction(self, video_id):
2445 """Report information extraction."""
2446 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2447
2448 def _real_extract(self, url):
d77c3dfd
FV
2449 mobj = re.match(self._VALID_URL, url)
2450 if mobj is None:
2451 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452 return
2453 video_id = mobj.group('videoid')
2454
2455 self.report_webpage(video_id)
2456 request = urllib2.Request(url)
2457 try:
2458 webpage = urllib2.urlopen(request).read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2461 return
2462
2463 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2464 if m is None:
2465 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2466 return
2467 internal_video_id = m.group('internalvideoid')
2468
2469 info = {
2470 'id': video_id,
2471 'internal_id': internal_video_id,
2472 }
2473
2474 self.report_extraction(video_id)
2475 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2476 try:
2477 metaXml = urllib2.urlopen(xmlUrl).read()
2478 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2479 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2480 return
2481
2482 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2483 try:
2484 videoNode = mdoc.findall('./video')[0]
2485 info['description'] = videoNode.findall('./description')[0].text
2486 info['title'] = videoNode.findall('./caption')[0].text
d77c3dfd
FV
2487 info['url'] = videoNode.findall('./file')[0].text
2488 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2489 info['ext'] = info['url'].rpartition('.')[2]
2490 info['format'] = info['ext']
2491 except IndexError:
2492 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2493 return
2494
58ca755f 2495 return [info]
d77c3dfd
FV
2496
2497
2498class XVideosIE(InfoExtractor):
2499 """Information extractor for xvideos.com"""
2500
2501 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2502 IE_NAME = u'xvideos'
2503
2504 def report_webpage(self, video_id):
2505 """Report information extraction."""
2506 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2507
2508 def report_extraction(self, video_id):
2509 """Report information extraction."""
2510 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2511
2512 def _real_extract(self, url):
d77c3dfd
FV
2513 mobj = re.match(self._VALID_URL, url)
2514 if mobj is None:
2515 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2516 return
2517 video_id = mobj.group(1).decode('utf-8')
2518
2519 self.report_webpage(video_id)
2520
2521 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2522 try:
2523 webpage = urllib2.urlopen(request).read()
2524 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2525 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2526 return
2527
2528 self.report_extraction(video_id)
2529
2530
2531 # Extract video URL
2532 mobj = re.search(r'flv_url=(.+?)&', webpage)
2533 if mobj is None:
2534 self._downloader.trouble(u'ERROR: unable to extract video url')
2535 return
2536 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2537
2538
2539 # Extract title
2540 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2541 if mobj is None:
2542 self._downloader.trouble(u'ERROR: unable to extract video title')
2543 return
2544 video_title = mobj.group(1).decode('utf-8')
2545
2546
2547 # Extract video thumbnail
363a4e11 2548 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
d77c3dfd
FV
2549 if mobj is None:
2550 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2551 return
363a4e11 2552 video_thumbnail = mobj.group(0).decode('utf-8')
d77c3dfd 2553
d77c3dfd
FV
2554 info = {
2555 'id': video_id,
2556 'url': video_url,
2557 'uploader': None,
2558 'upload_date': None,
2559 'title': video_title,
d77c3dfd
FV
2560 'ext': 'flv',
2561 'format': 'flv',
2562 'thumbnail': video_thumbnail,
2563 'description': None,
2564 'player_url': None,
2565 }
2566
58ca755f 2567 return [info]
d77c3dfd
FV
2568
2569
2570class SoundcloudIE(InfoExtractor):
2571 """Information extractor for soundcloud.com
2572 To access the media, the uid of the song and a stream token
2573 must be extracted from the page source and the script must make
2574 a request to media.soundcloud.com/crossdomain.xml. Then
2575 the media can be grabbed by requesting from an url composed
2576 of the stream token and uid
2577 """
2578
2579 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2580 IE_NAME = u'soundcloud'
2581
2582 def __init__(self, downloader=None):
2583 InfoExtractor.__init__(self, downloader)
2584
2585 def report_webpage(self, video_id):
2586 """Report information extraction."""
2587 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2588
2589 def report_extraction(self, video_id):
2590 """Report information extraction."""
2591 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2592
2593 def _real_extract(self, url):
d77c3dfd
FV
2594 mobj = re.match(self._VALID_URL, url)
2595 if mobj is None:
2596 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2597 return
2598
2599 # extract uploader (which is in the url)
2600 uploader = mobj.group(1).decode('utf-8')
2601 # extract simple title (uploader + slug of song title)
2602 slug_title = mobj.group(2).decode('utf-8')
2c288bda 2603 simple_title = uploader + u'-' + slug_title
d77c3dfd
FV
2604
2605 self.report_webpage('%s/%s' % (uploader, slug_title))
2606
2607 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2608 try:
2609 webpage = urllib2.urlopen(request).read()
2610 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2611 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2612 return
2613
2614 self.report_extraction('%s/%s' % (uploader, slug_title))
2615
2616 # extract uid and stream token that soundcloud hands out for access
2617 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2618 if mobj:
2619 video_id = mobj.group(1)
2620 stream_token = mobj.group(2)
2621
2622 # extract unsimplified title
2623 mobj = re.search('"title":"(.*?)",', webpage)
2624 if mobj:
2c288bda
FV
2625 title = mobj.group(1).decode('utf-8')
2626 else:
2627 title = simple_title
d77c3dfd
FV
2628
2629 # construct media url (with uid/token)
2630 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2631 mediaURL = mediaURL % (video_id, stream_token)
2632
2633 # description
2634 description = u'No description available'
2635 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2636 if mobj:
2637 description = mobj.group(1)
2638
2639 # upload date
2640 upload_date = None
2641 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2642 if mobj:
2643 try:
2644 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2645 except Exception, e:
6ab92c8b 2646 self._downloader.to_stderr(str(e))
d77c3dfd
FV
2647
2648 # for soundcloud, a request to a cross domain is required for cookies
2649 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2650
58ca755f
FV
2651 return [{
2652 'id': video_id.decode('utf-8'),
2653 'url': mediaURL,
2654 'uploader': uploader.decode('utf-8'),
2655 'upload_date': upload_date,
2c288bda 2656 'title': title,
58ca755f
FV
2657 'ext': u'mp3',
2658 'format': u'NA',
2659 'player_url': None,
2660 'description': description.decode('utf-8')
2661 }]
d77c3dfd
FV
2662
2663
2664class InfoQIE(InfoExtractor):
2665 """Information extractor for infoq.com"""
2666
2667 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2668 IE_NAME = u'infoq'
2669
2670 def report_webpage(self, video_id):
2671 """Report information extraction."""
2672 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2673
2674 def report_extraction(self, video_id):
2675 """Report information extraction."""
2676 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2677
2678 def _real_extract(self, url):
d77c3dfd
FV
2679 mobj = re.match(self._VALID_URL, url)
2680 if mobj is None:
2681 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2682 return
2683
2684 self.report_webpage(url)
2685
2686 request = urllib2.Request(url)
2687 try:
2688 webpage = urllib2.urlopen(request).read()
2689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2690 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2691 return
2692
2693 self.report_extraction(url)
2694
2695
2696 # Extract video URL
2697 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2698 if mobj is None:
2699 self._downloader.trouble(u'ERROR: unable to extract video url')
2700 return
2701 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2702
2703
2704 # Extract title
2705 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2706 if mobj is None:
2707 self._downloader.trouble(u'ERROR: unable to extract video title')
2708 return
2709 video_title = mobj.group(1).decode('utf-8')
2710
2711 # Extract description
2712 video_description = u'No description available.'
2713 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2714 if mobj is not None:
2715 video_description = mobj.group(1).decode('utf-8')
2716
2717 video_filename = video_url.split('/')[-1]
2718 video_id, extension = video_filename.split('.')
2719
d77c3dfd
FV
2720 info = {
2721 'id': video_id,
2722 'url': video_url,
2723 'uploader': None,
2724 'upload_date': None,
2725 'title': video_title,
d77c3dfd
FV
2726 'ext': extension,
2727 'format': extension, # Extension is always(?) mp4, but seems to be flv
2728 'thumbnail': None,
2729 'description': video_description,
2730 'player_url': None,
2731 }
2732
58ca755f 2733 return [info]
d77c3dfd
FV
2734
2735class MixcloudIE(InfoExtractor):
2736 """Information extractor for www.mixcloud.com"""
2737 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2738 IE_NAME = u'mixcloud'
2739
2740 def __init__(self, downloader=None):
2741 InfoExtractor.__init__(self, downloader)
2742
2743 def report_download_json(self, file_id):
2744 """Report JSON download."""
2745 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2746
2747 def report_extraction(self, file_id):
2748 """Report information extraction."""
2749 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2750
2751 def get_urls(self, jsonData, fmt, bitrate='best'):
2752 """Get urls from 'audio_formats' section in json"""
2753 file_url = None
2754 try:
2755 bitrate_list = jsonData[fmt]
2756 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2757 bitrate = max(bitrate_list) # select highest
2758
2759 url_list = jsonData[fmt][bitrate]
2760 except TypeError: # we have no bitrate info.
2761 url_list = jsonData[fmt]
d77c3dfd
FV
2762 return url_list
2763
2764 def check_urls(self, url_list):
2765 """Returns 1st active url from list"""
2766 for url in url_list:
2767 try:
2768 urllib2.urlopen(url)
2769 return url
2770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771 url = None
2772
2773 return None
2774
2775 def _print_formats(self, formats):
2776 print 'Available formats:'
2777 for fmt in formats.keys():
2778 for b in formats[fmt]:
2779 try:
2780 ext = formats[fmt][b][0]
2781 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2782 except TypeError: # we have no bitrate info
2783 ext = formats[fmt][0]
2784 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2785 break
2786
2787 def _real_extract(self, url):
2788 mobj = re.match(self._VALID_URL, url)
2789 if mobj is None:
2790 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2791 return
2792 # extract uploader & filename from url
2793 uploader = mobj.group(1).decode('utf-8')
2794 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2795
2796 # construct API request
2797 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2798 # retrieve .json file with links to files
2799 request = urllib2.Request(file_url)
2800 try:
2801 self.report_download_json(file_url)
2802 jsonData = urllib2.urlopen(request).read()
2803 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2804 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2805 return
2806
2807 # parse JSON
2808 json_data = json.loads(jsonData)
2809 player_url = json_data['player_swf_url']
2810 formats = dict(json_data['audio_formats'])
2811
2812 req_format = self._downloader.params.get('format', None)
2813 bitrate = None
2814
2815 if self._downloader.params.get('listformats', None):
2816 self._print_formats(formats)
2817 return
2818
2819 if req_format is None or req_format == 'best':
2820 for format_param in formats.keys():
2821 url_list = self.get_urls(formats, format_param)
2822 # check urls
2823 file_url = self.check_urls(url_list)
2824 if file_url is not None:
2825 break # got it!
2826 else:
2827 if req_format not in formats.keys():
2828 self._downloader.trouble(u'ERROR: format is not available')
2829 return
2830
2831 url_list = self.get_urls(formats, req_format)
2832 file_url = self.check_urls(url_list)
2833 format_param = req_format
2834
58ca755f
FV
2835 return [{
2836 'id': file_id.decode('utf-8'),
2837 'url': file_url.decode('utf-8'),
2838 'uploader': uploader.decode('utf-8'),
2839 'upload_date': u'NA',
2840 'title': json_data['name'],
58ca755f
FV
2841 'ext': file_url.split('.')[-1].decode('utf-8'),
2842 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2843 'thumbnail': json_data['thumbnail_url'],
2844 'description': json_data['description'],
2845 'player_url': player_url.decode('utf-8'),
2846 }]
d77c3dfd
FV
2847
2848class StanfordOpenClassroomIE(InfoExtractor):
2849 """Information extractor for Stanford's Open ClassRoom"""
2850
2851 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2852 IE_NAME = u'stanfordoc'
2853
2854 def report_download_webpage(self, objid):
2855 """Report information extraction."""
2856 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2857
2858 def report_extraction(self, video_id):
2859 """Report information extraction."""
2860 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2861
2862 def _real_extract(self, url):
2863 mobj = re.match(self._VALID_URL, url)
2864 if mobj is None:
2865 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2866 return
2867
2868 if mobj.group('course') and mobj.group('video'): # A specific video
2869 course = mobj.group('course')
2870 video = mobj.group('video')
2871 info = {
2c288bda 2872 'id': course + '_' + video,
d77c3dfd 2873 }
3fe294e4 2874
d77c3dfd
FV
2875 self.report_extraction(info['id'])
2876 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2877 xmlUrl = baseUrl + video + '.xml'
2878 try:
2879 metaXml = urllib2.urlopen(xmlUrl).read()
2880 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2881 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2882 return
2883 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2884 try:
2885 info['title'] = mdoc.findall('./title')[0].text
2886 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2887 except IndexError:
2888 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2889 return
d77c3dfd
FV
2890 info['ext'] = info['url'].rpartition('.')[2]
2891 info['format'] = info['ext']
58ca755f 2892 return [info]
d77c3dfd 2893 elif mobj.group('course'): # A course page
d77c3dfd
FV
2894 course = mobj.group('course')
2895 info = {
2c288bda 2896 'id': course,
d77c3dfd
FV
2897 'type': 'playlist',
2898 }
2899
2900 self.report_download_webpage(info['id'])
2901 try:
2902 coursepage = urllib2.urlopen(url).read()
2903 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2904 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2905 return
2906
2907 m = re.search('<h1>([^<]+)</h1>', coursepage)
2908 if m:
2909 info['title'] = unescapeHTML(m.group(1))
2910 else:
2911 info['title'] = info['id']
d77c3dfd
FV
2912
2913 m = re.search('<description>([^<]+)</description>', coursepage)
2914 if m:
2915 info['description'] = unescapeHTML(m.group(1))
2916
2917 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2918 info['list'] = [
2919 {
2920 'type': 'reference',
2921 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2922 }
2923 for vpage in links]
58ca755f 2924 results = []
d77c3dfd
FV
2925 for entry in info['list']:
2926 assert entry['type'] == 'reference'
58ca755f
FV
2927 results += self.extract(entry['url'])
2928 return results
2929
d77c3dfd 2930 else: # Root page
d77c3dfd
FV
2931 info = {
2932 'id': 'Stanford OpenClassroom',
2933 'type': 'playlist',
2934 }
2935
2936 self.report_download_webpage(info['id'])
2937 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2938 try:
2939 rootpage = urllib2.urlopen(rootURL).read()
2940 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2941 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2942 return
2943
2944 info['title'] = info['id']
d77c3dfd
FV
2945
2946 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2947 info['list'] = [
2948 {
2949 'type': 'reference',
2950 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2951 }
2952 for cpage in links]
2953
58ca755f 2954 results = []
d77c3dfd
FV
2955 for entry in info['list']:
2956 assert entry['type'] == 'reference'
58ca755f
FV
2957 results += self.extract(entry['url'])
2958 return results
d77c3dfd
FV
2959
2960class MTVIE(InfoExtractor):
2961 """Information extractor for MTV.com"""
2962
2963 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2964 IE_NAME = u'mtv'
2965
2966 def report_webpage(self, video_id):
2967 """Report information extraction."""
2968 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2969
2970 def report_extraction(self, video_id):
2971 """Report information extraction."""
2972 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2973
2974 def _real_extract(self, url):
2975 mobj = re.match(self._VALID_URL, url)
2976 if mobj is None:
2977 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2978 return
2979 if not mobj.group('proto'):
2980 url = 'http://' + url
2981 video_id = mobj.group('videoid')
2982 self.report_webpage(video_id)
2983
2984 request = urllib2.Request(url)
2985 try:
2986 webpage = urllib2.urlopen(request).read()
2987 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2988 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2989 return
2990
2991 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2992 if mobj is None:
2993 self._downloader.trouble(u'ERROR: unable to extract song name')
2994 return
2995 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2996 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2997 if mobj is None:
2998 self._downloader.trouble(u'ERROR: unable to extract performer')
2999 return
3000 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3001 video_title = performer + ' - ' + song_name
3002
3003 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3004 if mobj is None:
3005 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3006 return
3007 mtvn_uri = mobj.group(1)
3008
3009 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3010 if mobj is None:
3011 self._downloader.trouble(u'ERROR: unable to extract content id')
3012 return
3013 content_id = mobj.group(1)
3014
3015 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3016 self.report_extraction(video_id)
3017 request = urllib2.Request(videogen_url)
3018 try:
3019 metadataXml = urllib2.urlopen(request).read()
3020 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3021 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3022 return
3023
3024 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3025 renditions = mdoc.findall('.//rendition')
3026
3027 # For now, always pick the highest quality.
3028 rendition = renditions[-1]
3029
3030 try:
3031 _,_,ext = rendition.attrib['type'].partition('/')
3032 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3033 video_url = rendition.find('./src').text
3034 except KeyError:
3035 self._downloader.trouble('Invalid rendition field.')
3036 return
3037
d77c3dfd
FV
3038 info = {
3039 'id': video_id,
3040 'url': video_url,
3041 'uploader': performer,
3042 'title': video_title,
d77c3dfd
FV
3043 'ext': ext,
3044 'format': format,
3045 }
3046
58ca755f 3047 return [info]
6de7ef9b 3048
302efc19 3049
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a session id: ms timestamp followed by two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the fileid alphabet with a seeded LCG; return it as a list."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # linear congruential step matching the Youku player's scrambler
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string against the seeded alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract per-segment download URLs for a Youku video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])
        except (ValueError, KeyError, IndexError, TypeError):
            # Youku can only be viewed from mainland China, so the JSON often
            # lacks the expected fields.
            # BUG FIX: the previous bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; only data/parse errors belong here.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'title': video_title,
                'ext': ext,
                'format': u'NA'
            }
            files_info.append(info)

        return files_info
5dc846fa
FV
3170
3171
6de7ef9b 3172class XNXXIE(InfoExtractor):
3173 """Information extractor for xnxx.com"""
3174
3175 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3176 IE_NAME = u'xnxx'
3177 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3178 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3179 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3180
3181 def report_webpage(self, video_id):
3182 """Report information extraction"""
3183 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3184
3185 def report_extraction(self, video_id):
3186 """Report information extraction"""
3187 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3188
6de7ef9b 3189 def _real_extract(self, url):
3190 mobj = re.match(self._VALID_URL, url)
3191 if mobj is None:
3192 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3193 return
3194 video_id = mobj.group(1).decode('utf-8')
3195
3196 self.report_webpage(video_id)
3197
3198 # Get webpage content
3199 try:
3200 webpage = urllib2.urlopen(url).read()
3201 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3202 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3203 return
3204
795cc505
FV
3205 result = re.search(self.VIDEO_URL_RE, webpage)
3206 if result is None:
3207 self._downloader.trouble(u'ERROR: unable to extract video url')
3208 return
3209 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3210
3211 result = re.search(self.VIDEO_TITLE_RE, webpage)
3212 if result is None:
3213 self._downloader.trouble(u'ERROR: unable to extract video title')
3214 return
3215 video_title = result.group(1).decode('utf-8')
3216
3217 result = re.search(self.VIDEO_THUMB_RE, webpage)
3218 if result is None:
3219 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3220 return
3221 video_thumbnail = result.group(1).decode('utf-8')
3222
6de7ef9b 3223 info = {'id': video_id,
795cc505 3224 'url': video_url,
6de7ef9b 3225 'uploader': None,
3226 'upload_date': None,
795cc505 3227 'title': video_title,
6de7ef9b 3228 'ext': 'flv',
3229 'format': 'flv',
795cc505 3230 'thumbnail': video_thumbnail,
6de7ef9b 3231 'description': None,
3232 'player_url': None}
3233
ebe3f89e 3234 return [info]
fd873c69
FV
3235
3236
d443aca8
KK
3237class GooglePlusIE(InfoExtractor):
3238 """Information extractor for plus.google.com."""
3239
fd873c69 3240 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
d443aca8
KK
3241 IE_NAME = u'plus.google'
3242
3243 def __init__(self, downloader=None):
3244 InfoExtractor.__init__(self, downloader)
3245
3246 def report_extract_entry(self, url):
3247 """Report downloading extry"""
3248 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3249
3250 def report_date(self, upload_date):
3251 """Report downloading extry"""
3252 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3253
3254 def report_uploader(self, uploader):
3255 """Report downloading extry"""
3256 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3257
3258 def report_title(self, video_title):
3259 """Report downloading extry"""
3260 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3261
3262 def report_extract_vid_page(self, video_page):
3263 """Report information extraction."""
3264 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3265
3266 def _real_extract(self, url):
3267 # Extract id from URL
3268 mobj = re.match(self._VALID_URL, url)
3269 if mobj is None:
3270 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3271 return
3272
3273 post_url = mobj.group(0)
3274 video_id = mobj.group(2)
3275
3276 video_extension = 'flv'
3277
3278 # Step 1, Retrieve post webpage to extract further information
fd873c69 3279 self.report_extract_entry(post_url)
d443aca8
KK
3280 request = urllib2.Request(post_url)
3281 try:
d443aca8
KK
3282 webpage = urllib2.urlopen(request).read()
3283 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3284 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3285 return
3286
3287 # Extract update date
3288 upload_date = u'NA'
3289 pattern = 'title="Timestamp">(.*?)</a>'
3290 mobj = re.search(pattern, webpage)
3291 if mobj:
3292 upload_date = mobj.group(1)
fd873c69 3293 # Convert timestring to a format suitable for filename
d443aca8
KK
3294 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3295 upload_date = upload_date.strftime('%Y%m%d')
3296 self.report_date(upload_date)
3297
3298 # Extract uploader
3299 uploader = u'NA'
3300 pattern = r'rel\="author".*?>(.*?)</a>'
3301 mobj = re.search(pattern, webpage)
3302 if mobj:
3303 uploader = mobj.group(1)
3304 self.report_uploader(uploader)
3305
3306 # Extract title
fd873c69 3307 # Get the first line for title
d443aca8 3308 video_title = u'NA'
fd873c69 3309 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
d443aca8
KK
3310 mobj = re.search(pattern, webpage)
3311 if mobj:
3312 video_title = mobj.group(1)
3313 self.report_title(video_title)
3314
3315 # Step 2, Stimulate clicking the image box to launch video
3316 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3317 mobj = re.search(pattern, webpage)
3318 if mobj is None:
3319 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3320
3321 video_page = mobj.group(1)
3322 request = urllib2.Request(video_page)
3323 try:
3324 webpage = urllib2.urlopen(request).read()
3325 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3326 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3327 return
3328 self.report_extract_vid_page(video_page)
3329
3330
3331 # Extract video links on video page
3332 """Extract video links of all sizes"""
3333 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3334 mobj = re.findall(pattern, webpage)
fd873c69 3335 if len(mobj) == 0:
d443aca8
KK
3336 self._downloader.trouble(u'ERROR: unable to extract video links')
3337
3338 # Sort in resolution
3339 links = sorted(mobj)
3340
3341 # Choose the lowest of the sort, i.e. highest resolution
3342 video_url = links[-1]
3343 # Only get the url. The resolution part in the tuple has no use anymore
3344 video_url = video_url[-1]
3345 # Treat escaped \u0026 style hex
fd873c69 3346 video_url = unicode(video_url, "unicode_escape")
d443aca8
KK
3347
3348
3349 return [{
3350 'id': video_id.decode('utf-8'),
fd873c69 3351 'url': video_url,
d443aca8
KK
3352 'uploader': uploader.decode('utf-8'),
3353 'upload_date': upload_date.decode('utf-8'),
3354 'title': video_title.decode('utf-8'),
3355 'ext': video_extension.decode('utf-8'),
3356 'format': u'NA',
3357 'player_url': None,
3358 }]