]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
cfaef29045d95d45fbf7a8baf0b70874e881d0d7
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21 import cStringIO as StringIO
22 except ImportError:
23 import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
    """Common base for all site-specific information extractors.

    An information extractor takes a URL and produces one dictionary per
    video the URL refers to.  Each dictionary is handed to the
    FileDownloader, which may download the video file or merely print
    data from it.  Mandatory keys:

        id:         Video identifier.
        url:        Final video URL.
        uploader:   Nickname of the video uploader.
        title:      Literal title.
        ext:        Video filename extension.
        format:     Video format.
        player_url: SWF Player URL (may be None).

    Optional keys, used mainly when youtube-dl serves as the backend of
    a video search function (only printed when explicitly requested):

        thumbnail:  Full URL to a video thumbnail image.
        description: One-line video description.

    Subclasses should redefine _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and usually be added to the list of
    extractors.
    """

    # Class-level defaults; initialize() flips _ready on first use.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor and attach *downloader* (may be None)."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when *url* matches this extractor's _VALID_URL."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Perform one-time setup (authentication, etc.) on first call only."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the info dictionaries for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this extractor reports through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Subclass hook: actual initialization work."""
        pass

    def _real_extract(self, url):
        """Subclass hook: actual extraction work."""
        pass
95
96
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose pattern (always compiled with re.VERBOSE).  Group 1 matches the
    # optional URL prefix, group 2 captures the video ID, so a naked ID string
    # is also accepted.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Hitting this URL forces English pages so the scraping regexes below match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the original target from age-verification style redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Format (itag) codes, listed in order of quality (best first).
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same codes, but ranking free (webm) formats above equivalent mp4 ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "heightxwidth" display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        Each <text start="..." dur="..."> element becomes one numbered SRT
        cue with HH:MM:SS,mmm timestamps; a missing dur defaults to 4s.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # %02i truncates; the %1*1000 term carries the millisecond part.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the itag/extension/dimensions table for --list-formats."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set English language, then optionally log in and confirm age.

        Credentials come from --username/--password or, with --netrc, from
        the 'youtube' machine entry in ~/.netrc.  Network/auth failures are
        reported as warnings (errors for age confirmation) and abort setup.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (cookie-based; needed so page scraping sees English text)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and direct media URLs for one YouTube video.

        Returns a list of info dictionaries (one per requested format), or
        None after reporting trouble.  Relies on the watch page for
        date/description/player and on get_video_info for everything else.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL (needed by rtmpdump for some videos)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escapes in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each 'el' variant until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to YYYYMMDD
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except silently swallows everything,
                    # including KeyboardInterrupt; a failed parse just leaves
                    # upload_date as the raw scraped string.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: best-effort; any Trouble raised inside is reported
        # as a warning and leaves video_subtitles as None.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # Map lang_code -> track name for every listed caption track.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language choice: --subtitles-lang, then English, then first listed.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # trouble[0] indexes the exception's args (Python 2 behaviour).
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a URL-encoded query string
            # describing one format (itag, url, sig, ...).
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): only 'itag'/'url' are checked above, but 'sig' is
            # indexed unconditionally here — entries without a signature would
            # raise KeyError; confirm whether that can occur.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
493
494
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the numeric/"yt-..." video ID, group 2 the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST the opt-out form.

        This disables the site's content filter (via cookies) so adult-gated
        videos are reachable; failures are reported via trouble() and abort.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe video.

        "yt-" prefixed IDs are YouTube mirrors and are delegated to the
        downloader (and thus the YouTube extractor) instead.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message is missing "to" ("unable retrieve") — a
            # user-visible string, left untouched here.
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: mediaURL (plus optional gdaKey token) in the markup.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media data lives inside the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
622
623
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Any dailymotion TLD; group 1 is the raw path segment after /video/.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL plus metadata for one video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path segment looks like "<id>_<slug>?..."; keep only the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Quality keys in descending order; first substring hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
721
722
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Matches videoplay URLs on the various Google Video country domains;
    # group 1 is the (signed integer) docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and description for a Google Video docid."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Prefer the mp4 download_url; fall back to the flv stream URL.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the JS hex escapes: \x3d -> '=', \x26 -> '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail: only fetched (via a search query) when the
        # user explicitly asked for it with --get-thumbnail.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
816
817
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 captures the .flv filename from the "current" query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader for one video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # The direct media URL is the "file" parameter of the video_src link.
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader both come from the page <title>.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
883
884
885 class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
887
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
893
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
896
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
900
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
904
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
908 if mobj is None:
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
910 return
911
912 video_id = mobj.group(2)
913 video_extension = 'flv'
914
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = urllib2.Request(url)
919 try:
920 webpage = urllib2.urlopen(request).read()
921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
923 return
924
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
926 if mobj is None:
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
928 return
929 yahoo_id = mobj.group(1)
930
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
932 if mobj is None:
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
934 return
935 yahoo_vid = mobj.group(1)
936
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
938 return self._real_extract(url, new_video=False)
939
940 # Retrieve video webpage to extract further information
941 request = urllib2.Request(url)
942 try:
943 self.report_download_webpage(video_id)
944 webpage = urllib2.urlopen(request).read()
945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
947 return
948
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
952 if mobj is None:
953 self._downloader.trouble(u'ERROR: unable to extract video title')
954 return
955 video_title = mobj.group(1).decode('utf-8')
956
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
958 if mobj is None:
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
960 return
961 video_uploader = mobj.group(1).decode('utf-8')
962
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
965 if mobj is None:
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
967 return
968 video_thumbnail = mobj.group(1).decode('utf-8')
969
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
972 if mobj is None:
973 self._downloader.trouble(u'ERROR: unable to extract video description')
974 return
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
978
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
981 if mobj is None:
982 self._downloader.trouble(u'ERROR: unable to extract video height')
983 return
984 yv_video_height = mobj.group(1)
985
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
987 if mobj is None:
988 self._downloader.trouble(u'ERROR: unable to extract video width')
989 return
990 yv_video_width = mobj.group(1)
991
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000 try:
1001 self.report_download_webpage(video_id)
1002 webpage = urllib2.urlopen(request).read()
1003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1005 return
1006
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009 if mobj is None:
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011 return
1012 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013 video_url = unescapeHTML(video_url)
1014
1015 return [{
1016 'id': video_id.decode('utf-8'),
1017 'url': video_url,
1018 'uploader': video_uploader,
1019 'upload_date': u'NA',
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
1024 'thumbnail': video_thumbnail,
1025 'player_url': None,
1026 }]
1027
1028
1029 class VimeoIE(InfoExtractor):
1030 """Information extractor for vimeo.com."""
1031
1032 # _VALID_URL matches Vimeo URLs
1033 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1034 IE_NAME = u'vimeo'
1035
1036 def __init__(self, downloader=None):
1037 InfoExtractor.__init__(self, downloader)
1038
1039 def report_download_webpage(self, video_id):
1040 """Report webpage download."""
1041 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1042
1043 def report_extraction(self, video_id):
1044 """Report information extraction."""
1045 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1046
1047 def _real_extract(self, url, new_video=True):
1048 # Extract ID from URL
1049 mobj = re.match(self._VALID_URL, url)
1050 if mobj is None:
1051 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052 return
1053
1054 video_id = mobj.group(1)
1055
1056 # Retrieve video webpage to extract further information
1057 request = urllib2.Request(url, None, std_headers)
1058 try:
1059 self.report_download_webpage(video_id)
1060 webpage = urllib2.urlopen(request).read()
1061 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1062 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1063 return
1064
1065 # Now we begin extracting as much information as we can from what we
1066 # retrieved. First we extract the information common to all extractors,
1067 # and latter we extract those that are Vimeo specific.
1068 self.report_extraction(video_id)
1069
1070 # Extract the config JSON
1071 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1072 try:
1073 config = json.loads(config)
1074 except:
1075 self._downloader.trouble(u'ERROR: unable to extract info section')
1076 return
1077
1078 # Extract title
1079 video_title = config["video"]["title"]
1080
1081 # Extract uploader
1082 video_uploader = config["video"]["owner"]["name"]
1083
1084 # Extract video thumbnail
1085 video_thumbnail = config["video"]["thumbnail"]
1086
1087 # Extract video description
1088 video_description = get_element_by_id("description", webpage.decode('utf8'))
1089 if video_description: video_description = clean_html(video_description)
1090 else: video_description = ''
1091
1092 # Extract upload date
1093 video_upload_date = u'NA'
1094 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1095 if mobj is not None:
1096 video_upload_date = mobj.group(1)
1097
1098 # Vimeo specific: extract request signature and timestamp
1099 sig = config['request']['signature']
1100 timestamp = config['request']['timestamp']
1101
1102 # Vimeo specific: extract video codec and quality information
1103 # TODO bind to format param
1104 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1105 for codec in codecs:
1106 if codec[0] in config["video"]["files"]:
1107 video_codec = codec[0]
1108 video_extension = codec[1]
1109 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1110 else: quality = 'sd'
1111 break
1112 else:
1113 self._downloader.trouble(u'ERROR: no known codec found')
1114 return
1115
1116 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1117 %(video_id, sig, timestamp, quality, video_codec.upper())
1118
1119 return [{
1120 'id': video_id,
1121 'url': video_url,
1122 'uploader': video_uploader,
1123 'upload_date': video_upload_date,
1124 'title': video_title,
1125 'ext': video_extension,
1126 'thumbnail': video_thumbnail,
1127 'description': video_description,
1128 'player_url': None,
1129 }]
1130
1131
1132 class GenericIE(InfoExtractor):
1133 """Generic last-resort information extractor."""
1134
1135 _VALID_URL = r'.*'
1136 IE_NAME = u'generic'
1137
1138 def __init__(self, downloader=None):
1139 InfoExtractor.__init__(self, downloader)
1140
1141 def report_download_webpage(self, video_id):
1142 """Report webpage download."""
1143 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1144 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1145
1146 def report_extraction(self, video_id):
1147 """Report information extraction."""
1148 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1149
1150 def report_following_redirect(self, new_url):
1151 """Report information extraction."""
1152 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1153
1154 def _test_redirect(self, url):
1155 """Check if it is a redirect, like url shorteners, in case restart chain."""
1156 class HeadRequest(urllib2.Request):
1157 def get_method(self):
1158 return "HEAD"
1159
1160 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1161 """
1162 Subclass the HTTPRedirectHandler to make it use our
1163 HeadRequest also on the redirected URL
1164 """
1165 def redirect_request(self, req, fp, code, msg, headers, newurl):
1166 if code in (301, 302, 303, 307):
1167 newurl = newurl.replace(' ', '%20')
1168 newheaders = dict((k,v) for k,v in req.headers.items()
1169 if k.lower() not in ("content-length", "content-type"))
1170 return HeadRequest(newurl,
1171 headers=newheaders,
1172 origin_req_host=req.get_origin_req_host(),
1173 unverifiable=True)
1174 else:
1175 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1176
1177 class HTTPMethodFallback(urllib2.BaseHandler):
1178 """
1179 Fallback to GET if HEAD is not allowed (405 HTTP error)
1180 """
1181 def http_error_405(self, req, fp, code, msg, headers):
1182 fp.read()
1183 fp.close()
1184
1185 newheaders = dict((k,v) for k,v in req.headers.items()
1186 if k.lower() not in ("content-length", "content-type"))
1187 return self.parent.open(urllib2.Request(req.get_full_url(),
1188 headers=newheaders,
1189 origin_req_host=req.get_origin_req_host(),
1190 unverifiable=True))
1191
1192 # Build our opener
1193 opener = urllib2.OpenerDirector()
1194 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1195 HTTPMethodFallback, HEADRedirectHandler,
1196 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1197 opener.add_handler(handler())
1198
1199 response = opener.open(HeadRequest(url))
1200 new_url = response.geturl()
1201
1202 if url == new_url: return False
1203
1204 self.report_following_redirect(new_url)
1205 self._downloader.download([new_url])
1206 return True
1207
1208 def _real_extract(self, url):
1209 if self._test_redirect(url): return
1210
1211 video_id = url.split('/')[-1]
1212 request = urllib2.Request(url)
1213 try:
1214 self.report_download_webpage(video_id)
1215 webpage = urllib2.urlopen(request).read()
1216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1217 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1218 return
1219 except ValueError, err:
1220 # since this is the last-resort InfoExtractor, if
1221 # this error is thrown, it'll be thrown here
1222 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1223 return
1224
1225 self.report_extraction(video_id)
1226 # Start with something easy: JW Player in SWFObject
1227 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1228 if mobj is None:
1229 # Broaden the search a little bit
1230 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1231 if mobj is None:
1232 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1233 return
1234
1235 # It's possible that one of the regexes
1236 # matched, but returned an empty group:
1237 if mobj.group(1) is None:
1238 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1239 return
1240
1241 video_url = urllib.unquote(mobj.group(1))
1242 video_id = os.path.basename(video_url)
1243
1244 # here's a fun little line of code for you:
1245 video_extension = os.path.splitext(video_id)[1][1:]
1246 video_id = os.path.splitext(video_id)[0]
1247
1248 # it's tempting to parse this further, but you would
1249 # have to take into account all the variations like
1250 # Video Title - Site Name
1251 # Site Name | Video Title
1252 # Video Title - Tagline | Site Name
1253 # and so on and so forth; it's just not practical
1254 mobj = re.search(r'<title>(.*)</title>', webpage)
1255 if mobj is None:
1256 self._downloader.trouble(u'ERROR: unable to extract title')
1257 return
1258 video_title = mobj.group(1).decode('utf-8')
1259
1260 # video uploader is domain name
1261 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1262 if mobj is None:
1263 self._downloader.trouble(u'ERROR: unable to extract title')
1264 return
1265 video_uploader = mobj.group(1).decode('utf-8')
1266
1267 return [{
1268 'id': video_id.decode('utf-8'),
1269 'url': video_url.decode('utf-8'),
1270 'uploader': video_uploader,
1271 'upload_date': u'NA',
1272 'title': video_title,
1273 'ext': video_extension.decode('utf-8'),
1274 'format': u'NA',
1275 'player_url': None,
1276 }]
1277
1278
1279 class YoutubeSearchIE(InfoExtractor):
1280 """Information Extractor for YouTube search queries."""
1281 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1282 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1283 _max_youtube_results = 1000
1284 IE_NAME = u'youtube:search'
1285
1286 def __init__(self, downloader=None):
1287 InfoExtractor.__init__(self, downloader)
1288
1289 def report_download_page(self, query, pagenum):
1290 """Report attempt to download search page with given number."""
1291 query = query.decode(preferredencoding())
1292 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1293
1294 def _real_extract(self, query):
1295 mobj = re.match(self._VALID_URL, query)
1296 if mobj is None:
1297 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1298 return
1299
1300 prefix, query = query.split(':')
1301 prefix = prefix[8:]
1302 query = query.encode('utf-8')
1303 if prefix == '':
1304 self._download_n_results(query, 1)
1305 return
1306 elif prefix == 'all':
1307 self._download_n_results(query, self._max_youtube_results)
1308 return
1309 else:
1310 try:
1311 n = long(prefix)
1312 if n <= 0:
1313 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1314 return
1315 elif n > self._max_youtube_results:
1316 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1317 n = self._max_youtube_results
1318 self._download_n_results(query, n)
1319 return
1320 except ValueError: # parsing prefix as integer fails
1321 self._download_n_results(query, 1)
1322 return
1323
1324 def _download_n_results(self, query, n):
1325 """Downloads a specified number of results for a query"""
1326
1327 video_ids = []
1328 pagenum = 0
1329 limit = n
1330
1331 while (50 * pagenum) < limit:
1332 self.report_download_page(query, pagenum+1)
1333 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1334 request = urllib2.Request(result_url)
1335 try:
1336 data = urllib2.urlopen(request).read()
1337 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1338 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1339 return
1340 api_response = json.loads(data)['data']
1341
1342 new_ids = list(video['id'] for video in api_response['items'])
1343 video_ids += new_ids
1344
1345 limit = min(n, api_response['totalItems'])
1346 pagenum += 1
1347
1348 if len(video_ids) > n:
1349 video_ids = video_ids[:n]
1350 for id in video_ids:
1351 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1352 return
1353
1354
1355 class GoogleSearchIE(InfoExtractor):
1356 """Information Extractor for Google Video search queries."""
1357 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1358 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1359 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1360 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1361 _max_google_results = 1000
1362 IE_NAME = u'video.google:search'
1363
1364 def __init__(self, downloader=None):
1365 InfoExtractor.__init__(self, downloader)
1366
1367 def report_download_page(self, query, pagenum):
1368 """Report attempt to download playlist page with given number."""
1369 query = query.decode(preferredencoding())
1370 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1371
1372 def _real_extract(self, query):
1373 mobj = re.match(self._VALID_URL, query)
1374 if mobj is None:
1375 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1376 return
1377
1378 prefix, query = query.split(':')
1379 prefix = prefix[8:]
1380 query = query.encode('utf-8')
1381 if prefix == '':
1382 self._download_n_results(query, 1)
1383 return
1384 elif prefix == 'all':
1385 self._download_n_results(query, self._max_google_results)
1386 return
1387 else:
1388 try:
1389 n = long(prefix)
1390 if n <= 0:
1391 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1392 return
1393 elif n > self._max_google_results:
1394 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1395 n = self._max_google_results
1396 self._download_n_results(query, n)
1397 return
1398 except ValueError: # parsing prefix as integer fails
1399 self._download_n_results(query, 1)
1400 return
1401
1402 def _download_n_results(self, query, n):
1403 """Downloads a specified number of results for a query"""
1404
1405 video_ids = []
1406 pagenum = 0
1407
1408 while True:
1409 self.report_download_page(query, pagenum)
1410 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1411 request = urllib2.Request(result_url)
1412 try:
1413 page = urllib2.urlopen(request).read()
1414 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1415 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1416 return
1417
1418 # Extract video identifiers
1419 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1420 video_id = mobj.group(1)
1421 if video_id not in video_ids:
1422 video_ids.append(video_id)
1423 if len(video_ids) == n:
1424 # Specified n videos reached
1425 for id in video_ids:
1426 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1427 return
1428
1429 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1430 for id in video_ids:
1431 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1432 return
1433
1434 pagenum = pagenum + 1
1435
1436
1437 class YahooSearchIE(InfoExtractor):
1438 """Information Extractor for Yahoo! Video search queries."""
1439 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1440 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1441 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1442 _MORE_PAGES_INDICATOR = r'\s*Next'
1443 _max_yahoo_results = 1000
1444 IE_NAME = u'video.yahoo:search'
1445
1446 def __init__(self, downloader=None):
1447 InfoExtractor.__init__(self, downloader)
1448
1449 def report_download_page(self, query, pagenum):
1450 """Report attempt to download playlist page with given number."""
1451 query = query.decode(preferredencoding())
1452 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1453
1454 def _real_extract(self, query):
1455 mobj = re.match(self._VALID_URL, query)
1456 if mobj is None:
1457 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1458 return
1459
1460 prefix, query = query.split(':')
1461 prefix = prefix[8:]
1462 query = query.encode('utf-8')
1463 if prefix == '':
1464 self._download_n_results(query, 1)
1465 return
1466 elif prefix == 'all':
1467 self._download_n_results(query, self._max_yahoo_results)
1468 return
1469 else:
1470 try:
1471 n = long(prefix)
1472 if n <= 0:
1473 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1474 return
1475 elif n > self._max_yahoo_results:
1476 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1477 n = self._max_yahoo_results
1478 self._download_n_results(query, n)
1479 return
1480 except ValueError: # parsing prefix as integer fails
1481 self._download_n_results(query, 1)
1482 return
1483
1484 def _download_n_results(self, query, n):
1485 """Downloads a specified number of results for a query"""
1486
1487 video_ids = []
1488 already_seen = set()
1489 pagenum = 1
1490
1491 while True:
1492 self.report_download_page(query, pagenum)
1493 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1494 request = urllib2.Request(result_url)
1495 try:
1496 page = urllib2.urlopen(request).read()
1497 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1498 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1499 return
1500
1501 # Extract video identifiers
1502 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1503 video_id = mobj.group(1)
1504 if video_id not in already_seen:
1505 video_ids.append(video_id)
1506 already_seen.add(video_id)
1507 if len(video_ids) == n:
1508 # Specified n videos reached
1509 for id in video_ids:
1510 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1511 return
1512
1513 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1514 for id in video_ids:
1515 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1516 return
1517
1518 pagenum = pagenum + 1
1519
1520
1521 class YoutubePlaylistIE(InfoExtractor):
1522 """Information Extractor for YouTube playlists."""
1523
1524 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1525 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1526 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1527 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1528 IE_NAME = u'youtube:playlist'
1529
1530 def __init__(self, downloader=None):
1531 InfoExtractor.__init__(self, downloader)
1532
1533 def report_download_page(self, playlist_id, pagenum):
1534 """Report attempt to download playlist page with given number."""
1535 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1536
1537 def _real_extract(self, url):
1538 # Extract playlist id
1539 mobj = re.match(self._VALID_URL, url)
1540 if mobj is None:
1541 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1542 return
1543
1544 # Single video case
1545 if mobj.group(3) is not None:
1546 self._downloader.download([mobj.group(3)])
1547 return
1548
1549 # Download playlist pages
1550 # prefix is 'p' as default for playlists but there are other types that need extra care
1551 playlist_prefix = mobj.group(1)
1552 if playlist_prefix == 'a':
1553 playlist_access = 'artist'
1554 else:
1555 playlist_prefix = 'p'
1556 playlist_access = 'view_play_list'
1557 playlist_id = mobj.group(2)
1558 video_ids = []
1559 pagenum = 1
1560
1561 while True:
1562 self.report_download_page(playlist_id, pagenum)
1563 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1564 request = urllib2.Request(url)
1565 try:
1566 page = urllib2.urlopen(request).read()
1567 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1568 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1569 return
1570
1571 # Extract video identifiers
1572 ids_in_page = []
1573 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1574 if mobj.group(1) not in ids_in_page:
1575 ids_in_page.append(mobj.group(1))
1576 video_ids.extend(ids_in_page)
1577
1578 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1579 break
1580 pagenum = pagenum + 1
1581
1582 playliststart = self._downloader.params.get('playliststart', 1) - 1
1583 playlistend = self._downloader.params.get('playlistend', -1)
1584 if playlistend == -1:
1585 video_ids = video_ids[playliststart:]
1586 else:
1587 video_ids = video_ids[playliststart:playlistend]
1588
1589 for id in video_ids:
1590 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1591 return
1592
1593
1594 class YoutubeChannelIE(InfoExtractor):
1595 """Information Extractor for YouTube channels."""
1596
1597 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1598 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1599 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1600 IE_NAME = u'youtube:channel'
1601
1602 def report_download_page(self, channel_id, pagenum):
1603 """Report attempt to download channel page with given number."""
1604 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1605
1606 def _real_extract(self, url):
1607 # Extract channel id
1608 mobj = re.match(self._VALID_URL, url)
1609 if mobj is None:
1610 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1611 return
1612
1613 # Download channel pages
1614 channel_id = mobj.group(1)
1615 video_ids = []
1616 pagenum = 1
1617
1618 while True:
1619 self.report_download_page(channel_id, pagenum)
1620 url = self._TEMPLATE_URL % (channel_id, pagenum)
1621 request = urllib2.Request(url)
1622 try:
1623 page = urllib2.urlopen(request).read()
1624 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1625 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1626 return
1627
1628 # Extract video identifiers
1629 ids_in_page = []
1630 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1631 if mobj.group(1) not in ids_in_page:
1632 ids_in_page.append(mobj.group(1))
1633 video_ids.extend(ids_in_page)
1634
1635 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1636 break
1637 pagenum = pagenum + 1
1638
1639 for id in video_ids:
1640 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1641 return
1642
1643
1644 class YoutubeUserIE(InfoExtractor):
1645 """Information Extractor for YouTube users."""
1646
1647 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1648 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1649 _GDATA_PAGE_SIZE = 50
1650 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1651 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1652 IE_NAME = u'youtube:user'
1653
1654 def __init__(self, downloader=None):
1655 InfoExtractor.__init__(self, downloader)
1656
1657 def report_download_page(self, username, start_index):
1658 """Report attempt to download user page."""
1659 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1660 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1661
1662 def _real_extract(self, url):
1663 # Extract username
1664 mobj = re.match(self._VALID_URL, url)
1665 if mobj is None:
1666 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1667 return
1668
1669 username = mobj.group(1)
1670
1671 # Download video ids using YouTube Data API. Result size per
1672 # query is limited (currently to 50 videos) so we need to query
1673 # page by page until there are no video ids - it means we got
1674 # all of them.
1675
1676 video_ids = []
1677 pagenum = 0
1678
1679 while True:
1680 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1681 self.report_download_page(username, start_index)
1682
1683 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1684
1685 try:
1686 page = urllib2.urlopen(request).read()
1687 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1688 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1689 return
1690
1691 # Extract video identifiers
1692 ids_in_page = []
1693
1694 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1695 if mobj.group(1) not in ids_in_page:
1696 ids_in_page.append(mobj.group(1))
1697
1698 video_ids.extend(ids_in_page)
1699
1700 # A little optimization - if current page is not
1701 # "full", ie. does not contain PAGE_SIZE video ids then
1702 # we can assume that this page is the last one - there
1703 # are no more ids on further pages - no need to query
1704 # again.
1705
1706 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1707 break
1708
1709 pagenum += 1
1710
1711 all_ids_count = len(video_ids)
1712 playliststart = self._downloader.params.get('playliststart', 1) - 1
1713 playlistend = self._downloader.params.get('playlistend', -1)
1714
1715 if playlistend == -1:
1716 video_ids = video_ids[playliststart:]
1717 else:
1718 video_ids = video_ids[playliststart:playlistend]
1719
1720 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1721 (username, all_ids_count, len(video_ids)))
1722
1723 for video_id in video_ids:
1724 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1725
1726
1727 class BlipTVUserIE(InfoExtractor):
1728 """Information Extractor for blip.tv users."""
1729
1730 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1731 _PAGE_SIZE = 12
1732 IE_NAME = u'blip.tv:user'
1733
1734 def __init__(self, downloader=None):
1735 InfoExtractor.__init__(self, downloader)
1736
1737 def report_download_page(self, username, pagenum):
1738 """Report attempt to download user page."""
1739 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1740 (self.IE_NAME, username, pagenum))
1741
1742 def _real_extract(self, url):
1743 # Extract username
1744 mobj = re.match(self._VALID_URL, url)
1745 if mobj is None:
1746 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1747 return
1748
1749 username = mobj.group(1)
1750
1751 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1752
1753 request = urllib2.Request(url)
1754
1755 try:
1756 page = urllib2.urlopen(request).read().decode('utf-8')
1757 mobj = re.search(r'data-users-id="([^"]+)"', page)
1758 page_base = page_base % mobj.group(1)
1759 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1760 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1761 return
1762
1763
1764 # Download video ids using BlipTV Ajax calls. Result size per
1765 # query is limited (currently to 12 videos) so we need to query
1766 # page by page until there are no video ids - it means we got
1767 # all of them.
1768
1769 video_ids = []
1770 pagenum = 1
1771
1772 while True:
1773 self.report_download_page(username, pagenum)
1774
1775 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1776
1777 try:
1778 page = urllib2.urlopen(request).read().decode('utf-8')
1779 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1780 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1781 return
1782
1783 # Extract video identifiers
1784 ids_in_page = []
1785
1786 for mobj in re.finditer(r'href="/([^"]+)"', page):
1787 if mobj.group(1) not in ids_in_page:
1788 ids_in_page.append(unescapeHTML(mobj.group(1)))
1789
1790 video_ids.extend(ids_in_page)
1791
1792 # A little optimization - if current page is not
1793 # "full", ie. does not contain PAGE_SIZE video ids then
1794 # we can assume that this page is the last one - there
1795 # are no more ids on further pages - no need to query
1796 # again.
1797
1798 if len(ids_in_page) < self._PAGE_SIZE:
1799 break
1800
1801 pagenum += 1
1802
1803 all_ids_count = len(video_ids)
1804 playliststart = self._downloader.params.get('playliststart', 1) - 1
1805 playlistend = self._downloader.params.get('playlistend', -1)
1806
1807 if playlistend == -1:
1808 video_ids = video_ids[playliststart:]
1809 else:
1810 video_ids = video_ids[playliststart:playlistend]
1811
1812 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1813 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1814
1815 for video_id in video_ids:
1816 self._downloader.download([u'http://blip.tv/'+video_id])
1817
1818
1819 class DepositFilesIE(InfoExtractor):
1820 """Information extractor for depositfiles.com"""
1821
1822 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1823 IE_NAME = u'DepositFiles'
1824
1825 def __init__(self, downloader=None):
1826 InfoExtractor.__init__(self, downloader)
1827
1828 def report_download_webpage(self, file_id):
1829 """Report webpage download."""
1830 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1831
1832 def report_extraction(self, file_id):
1833 """Report information extraction."""
1834 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1835
1836 def _real_extract(self, url):
1837 file_id = url.split('/')[-1]
1838 # Rebuild url in english locale
1839 url = 'http://depositfiles.com/en/files/' + file_id
1840
1841 # Retrieve file webpage with 'Free download' button pressed
1842 free_download_indication = { 'gateway_result' : '1' }
1843 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1844 try:
1845 self.report_download_webpage(file_id)
1846 webpage = urllib2.urlopen(request).read()
1847 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1849 return
1850
1851 # Search for the real file URL
1852 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1853 if (mobj is None) or (mobj.group(1) is None):
1854 # Try to figure out reason of the error.
1855 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1856 if (mobj is not None) and (mobj.group(1) is not None):
1857 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1858 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1859 else:
1860 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1861 return
1862
1863 file_url = mobj.group(1)
1864 file_extension = os.path.splitext(file_url)[1][1:]
1865
1866 # Search for file title
1867 mobj = re.search(r'<b title="(.*?)">', webpage)
1868 if mobj is None:
1869 self._downloader.trouble(u'ERROR: unable to extract title')
1870 return
1871 file_title = mobj.group(1).decode('utf-8')
1872
1873 return [{
1874 'id': file_id.decode('utf-8'),
1875 'url': file_url.decode('utf-8'),
1876 'uploader': u'NA',
1877 'upload_date': u'NA',
1878 'title': file_title,
1879 'ext': file_extension.decode('utf-8'),
1880 'format': u'NA',
1881 'player_url': None,
1882 }]
1883
1884
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names ordered best quality first; the order drives default
    # format selection in _real_extract.
    _available_formats = ['video', 'highqual', 'lowqual']
    # Format name -> container extension.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Scrapes title/description/owner/thumbnail plus the per-format
        stream URLs out of JavaScript fragments in the page source and
        returns them as a dict (the URLs under key 'video_urls').
        Missing fields are simply absent from the result.
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are percent- and \uXXXX-escaped inside the JS.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are supplied via options
        or .netrc; otherwise continue anonymously."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials found: skip the login step entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the response still contains a login <form>, the
            # authentication did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse it and return one info dict per
        selected format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz yields a 10-tuple; drop the trailing tz
                    # offset so strftime gets the 9 fields it expects.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # NOTE(review): if url_map is empty, video_url_list is never bound
        # and the loop below raises NameError — confirm intended behavior.
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Honour --format-limit by truncating the preference list.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
2090
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Request the JSON description of the video by appending the
        # skin=json parameters (joined with & or ? as appropriate).
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL pointed straight at a media file: derive id and
                # title from the URL basename and pass the already-open
                # handle along under 'urlhandle'.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                # Reuse the handle opened above to read the JSON body.
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H' (24-hour) combined with '%p' looks
                # off ('%I' would be the 12-hour directive), but only the
                # date portion is kept by '%Y%m%d' so the output should be
                # unaffected — confirm against real datestamp values.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Present ourselves as iTunes for the actual media download;
        # presumably blip.tv serves the raw file to this UA — verify.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2177
2178
2179 class MyVideoIE(InfoExtractor):
2180 """Information Extractor for myvideo.de."""
2181
2182 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2183 IE_NAME = u'myvideo'
2184
2185 def __init__(self, downloader=None):
2186 InfoExtractor.__init__(self, downloader)
2187
2188 def report_download_webpage(self, video_id):
2189 """Report webpage download."""
2190 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2191
2192 def report_extraction(self, video_id):
2193 """Report information extraction."""
2194 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2195
2196 def _real_extract(self,url):
2197 mobj = re.match(self._VALID_URL, url)
2198 if mobj is None:
2199 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2200 return
2201
2202 video_id = mobj.group(1)
2203
2204 # Get video webpage
2205 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2206 try:
2207 self.report_download_webpage(video_id)
2208 webpage = urllib2.urlopen(request).read()
2209 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2210 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2211 return
2212
2213 self.report_extraction(video_id)
2214 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2215 webpage)
2216 if mobj is None:
2217 self._downloader.trouble(u'ERROR: unable to extract media URL')
2218 return
2219 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2220
2221 mobj = re.search('<title>([^<]+)</title>', webpage)
2222 if mobj is None:
2223 self._downloader.trouble(u'ERROR: unable to extract title')
2224 return
2225
2226 video_title = mobj.group(1)
2227
2228 return [{
2229 'id': video_id,
2230 'url': video_url,
2231 'uploader': u'NA',
2232 'upload_date': u'NA',
2233 'title': video_title,
2234 'ext': u'flv',
2235 'format': u'NA',
2236 'player_url': None,
2237 }]
2238
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Matches either a shortname pseudo-URL (":tds", ":colbert", ...) or a
    # full-episodes URL on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve a show or episode URL to one info dict per media part,
        going through the mtvnservices player, the MRSS show index and the
        per-item mediaGen configuration."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Rewrite shortname forms to the show's full-episodes page and
        # re-match so the named groups are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No specific episode given: the site will redirect to the newest.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # We were redirected to a concrete episode; re-parse the final
            # URL to recover the episode name.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Each match is (full player URL, media uri) from the embedded
        # Flash player parameters.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects so we hand downstream the final player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        results = []

        # The MRSS index lists one <item> per media part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect (bitrate, url) pairs for every available rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            results.append(info)

        return results
2370
2371
2372 class EscapistIE(InfoExtractor):
2373 """Information extractor for The Escapist """
2374
2375 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2376 IE_NAME = u'escapist'
2377
2378 def report_extraction(self, showName):
2379 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2380
2381 def report_config_download(self, showName):
2382 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2383
2384 def _real_extract(self, url):
2385 mobj = re.match(self._VALID_URL, url)
2386 if mobj is None:
2387 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2388 return
2389 showName = mobj.group('showname')
2390 videoId = mobj.group('episode')
2391
2392 self.report_extraction(showName)
2393 try:
2394 webPage = urllib2.urlopen(url)
2395 webPageBytes = webPage.read()
2396 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2397 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2398 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2399 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2400 return
2401
2402 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2403 description = unescapeHTML(descMatch.group(1))
2404 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2405 imgUrl = unescapeHTML(imgMatch.group(1))
2406 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2407 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2408 configUrlMatch = re.search('config=(.*)$', playerUrl)
2409 configUrl = urllib2.unquote(configUrlMatch.group(1))
2410
2411 self.report_config_download(showName)
2412 try:
2413 configJSON = urllib2.urlopen(configUrl).read()
2414 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2415 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2416 return
2417
2418 # Technically, it's JavaScript, not JSON
2419 configJSON = configJSON.replace("'", '"')
2420
2421 try:
2422 config = json.loads(configJSON)
2423 except (ValueError,), err:
2424 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2425 return
2426
2427 playlist = config['playlist']
2428 videoUrl = playlist[1]['url']
2429
2430 info = {
2431 'id': videoId,
2432 'url': videoUrl,
2433 'uploader': showName,
2434 'upload_date': None,
2435 'title': showName,
2436 'ext': 'flv',
2437 'format': 'flv',
2438 'thumbnail': imgUrl,
2439 'description': description,
2440 'player_url': playerUrl,
2441 }
2442
2443 return [info]
2444
2445
2446 class CollegeHumorIE(InfoExtractor):
2447 """Information extractor for collegehumor.com"""
2448
2449 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2450 IE_NAME = u'collegehumor'
2451
2452 def report_webpage(self, video_id):
2453 """Report information extraction."""
2454 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2455
2456 def report_extraction(self, video_id):
2457 """Report information extraction."""
2458 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2459
2460 def _real_extract(self, url):
2461 mobj = re.match(self._VALID_URL, url)
2462 if mobj is None:
2463 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2464 return
2465 video_id = mobj.group('videoid')
2466
2467 self.report_webpage(video_id)
2468 request = urllib2.Request(url)
2469 try:
2470 webpage = urllib2.urlopen(request).read()
2471 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2472 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2473 return
2474
2475 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2476 if m is None:
2477 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2478 return
2479 internal_video_id = m.group('internalvideoid')
2480
2481 info = {
2482 'id': video_id,
2483 'internal_id': internal_video_id,
2484 }
2485
2486 self.report_extraction(video_id)
2487 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2488 try:
2489 metaXml = urllib2.urlopen(xmlUrl).read()
2490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2491 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2492 return
2493
2494 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2495 try:
2496 videoNode = mdoc.findall('./video')[0]
2497 info['description'] = videoNode.findall('./description')[0].text
2498 info['title'] = videoNode.findall('./caption')[0].text
2499 info['url'] = videoNode.findall('./file')[0].text
2500 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2501 info['ext'] = info['url'].rpartition('.')[2]
2502 info['format'] = info['ext']
2503 except IndexError:
2504 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2505 return
2506
2507 return [info]
2508
2509
2510 class XVideosIE(InfoExtractor):
2511 """Information extractor for xvideos.com"""
2512
2513 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2514 IE_NAME = u'xvideos'
2515
2516 def report_webpage(self, video_id):
2517 """Report information extraction."""
2518 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2519
2520 def report_extraction(self, video_id):
2521 """Report information extraction."""
2522 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2523
2524 def _real_extract(self, url):
2525 mobj = re.match(self._VALID_URL, url)
2526 if mobj is None:
2527 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2528 return
2529 video_id = mobj.group(1).decode('utf-8')
2530
2531 self.report_webpage(video_id)
2532
2533 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2534 try:
2535 webpage = urllib2.urlopen(request).read()
2536 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2537 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2538 return
2539
2540 self.report_extraction(video_id)
2541
2542
2543 # Extract video URL
2544 mobj = re.search(r'flv_url=(.+?)&', webpage)
2545 if mobj is None:
2546 self._downloader.trouble(u'ERROR: unable to extract video url')
2547 return
2548 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2549
2550
2551 # Extract title
2552 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2553 if mobj is None:
2554 self._downloader.trouble(u'ERROR: unable to extract video title')
2555 return
2556 video_title = mobj.group(1).decode('utf-8')
2557
2558
2559 # Extract video thumbnail
2560 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2561 if mobj is None:
2562 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2563 return
2564 video_thumbnail = mobj.group(0).decode('utf-8')
2565
2566 info = {
2567 'id': video_id,
2568 'url': video_url,
2569 'uploader': None,
2570 'upload_date': None,
2571 'title': video_title,
2572 'ext': 'flv',
2573 'format': 'flv',
2574 'thumbnail': video_thumbnail,
2575 'description': None,
2576 'player_url': None,
2577 }
2578
2579 return [info]
2580
2581
2582 class SoundcloudIE(InfoExtractor):
2583 """Information extractor for soundcloud.com
2584 To access the media, the uid of the song and a stream token
2585 must be extracted from the page source and the script must make
2586 a request to media.soundcloud.com/crossdomain.xml. Then
2587 the media can be grabbed by requesting from an url composed
2588 of the stream token and uid
2589 """
2590
2591 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2592 IE_NAME = u'soundcloud'
2593
2594 def __init__(self, downloader=None):
2595 InfoExtractor.__init__(self, downloader)
2596
2597 def report_webpage(self, video_id):
2598 """Report information extraction."""
2599 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2600
2601 def report_extraction(self, video_id):
2602 """Report information extraction."""
2603 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2604
2605 def _real_extract(self, url):
2606 mobj = re.match(self._VALID_URL, url)
2607 if mobj is None:
2608 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2609 return
2610
2611 # extract uploader (which is in the url)
2612 uploader = mobj.group(1).decode('utf-8')
2613 # extract simple title (uploader + slug of song title)
2614 slug_title = mobj.group(2).decode('utf-8')
2615 simple_title = uploader + u'-' + slug_title
2616
2617 self.report_webpage('%s/%s' % (uploader, slug_title))
2618
2619 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2620 try:
2621 webpage = urllib2.urlopen(request).read()
2622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2624 return
2625
2626 self.report_extraction('%s/%s' % (uploader, slug_title))
2627
2628 # extract uid and stream token that soundcloud hands out for access
2629 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2630 if mobj:
2631 video_id = mobj.group(1)
2632 stream_token = mobj.group(2)
2633
2634 # extract unsimplified title
2635 mobj = re.search('"title":"(.*?)",', webpage)
2636 if mobj:
2637 title = mobj.group(1).decode('utf-8')
2638 else:
2639 title = simple_title
2640
2641 # construct media url (with uid/token)
2642 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2643 mediaURL = mediaURL % (video_id, stream_token)
2644
2645 # description
2646 description = u'No description available'
2647 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2648 if mobj:
2649 description = mobj.group(1)
2650
2651 # upload date
2652 upload_date = None
2653 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2654 if mobj:
2655 try:
2656 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2657 except Exception, e:
2658 self._downloader.to_stderr(str(e))
2659
2660 # for soundcloud, a request to a cross domain is required for cookies
2661 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2662
2663 return [{
2664 'id': video_id.decode('utf-8'),
2665 'url': mediaURL,
2666 'uploader': uploader.decode('utf-8'),
2667 'upload_date': upload_date,
2668 'title': title,
2669 'ext': u'mp3',
2670 'format': u'NA',
2671 'player_url': None,
2672 'description': description.decode('utf-8')
2673 }]
2674
2675
2676 class InfoQIE(InfoExtractor):
2677 """Information extractor for infoq.com"""
2678
2679 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2680 IE_NAME = u'infoq'
2681
2682 def report_webpage(self, video_id):
2683 """Report information extraction."""
2684 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2685
2686 def report_extraction(self, video_id):
2687 """Report information extraction."""
2688 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2689
2690 def _real_extract(self, url):
2691 mobj = re.match(self._VALID_URL, url)
2692 if mobj is None:
2693 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2694 return
2695
2696 self.report_webpage(url)
2697
2698 request = urllib2.Request(url)
2699 try:
2700 webpage = urllib2.urlopen(request).read()
2701 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2702 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2703 return
2704
2705 self.report_extraction(url)
2706
2707
2708 # Extract video URL
2709 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2710 if mobj is None:
2711 self._downloader.trouble(u'ERROR: unable to extract video url')
2712 return
2713 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2714
2715
2716 # Extract title
2717 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2718 if mobj is None:
2719 self._downloader.trouble(u'ERROR: unable to extract video title')
2720 return
2721 video_title = mobj.group(1).decode('utf-8')
2722
2723 # Extract description
2724 video_description = u'No description available.'
2725 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2726 if mobj is not None:
2727 video_description = mobj.group(1).decode('utf-8')
2728
2729 video_filename = video_url.split('/')[-1]
2730 video_id, extension = video_filename.split('.')
2731
2732 info = {
2733 'id': video_id,
2734 'url': video_url,
2735 'uploader': None,
2736 'upload_date': None,
2737 'title': video_title,
2738 'ext': extension,
2739 'format': extension, # Extension is always(?) mp4, but seems to be flv
2740 'thumbnail': None,
2741 'description': video_description,
2742 'player_url': None,
2743 }
2744
2745 return [info]
2746
2747 class MixcloudIE(InfoExtractor):
2748 """Information extractor for www.mixcloud.com"""
2749 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2750 IE_NAME = u'mixcloud'
2751
2752 def __init__(self, downloader=None):
2753 InfoExtractor.__init__(self, downloader)
2754
2755 def report_download_json(self, file_id):
2756 """Report JSON download."""
2757 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2758
2759 def report_extraction(self, file_id):
2760 """Report information extraction."""
2761 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2762
2763 def get_urls(self, jsonData, fmt, bitrate='best'):
2764 """Get urls from 'audio_formats' section in json"""
2765 file_url = None
2766 try:
2767 bitrate_list = jsonData[fmt]
2768 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2769 bitrate = max(bitrate_list) # select highest
2770
2771 url_list = jsonData[fmt][bitrate]
2772 except TypeError: # we have no bitrate info.
2773 url_list = jsonData[fmt]
2774 return url_list
2775
2776 def check_urls(self, url_list):
2777 """Returns 1st active url from list"""
2778 for url in url_list:
2779 try:
2780 urllib2.urlopen(url)
2781 return url
2782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783 url = None
2784
2785 return None
2786
2787 def _print_formats(self, formats):
2788 print 'Available formats:'
2789 for fmt in formats.keys():
2790 for b in formats[fmt]:
2791 try:
2792 ext = formats[fmt][b][0]
2793 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2794 except TypeError: # we have no bitrate info
2795 ext = formats[fmt][0]
2796 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2797 break
2798
2799 def _real_extract(self, url):
2800 mobj = re.match(self._VALID_URL, url)
2801 if mobj is None:
2802 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2803 return
2804 # extract uploader & filename from url
2805 uploader = mobj.group(1).decode('utf-8')
2806 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2807
2808 # construct API request
2809 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2810 # retrieve .json file with links to files
2811 request = urllib2.Request(file_url)
2812 try:
2813 self.report_download_json(file_url)
2814 jsonData = urllib2.urlopen(request).read()
2815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2816 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2817 return
2818
2819 # parse JSON
2820 json_data = json.loads(jsonData)
2821 player_url = json_data['player_swf_url']
2822 formats = dict(json_data['audio_formats'])
2823
2824 req_format = self._downloader.params.get('format', None)
2825 bitrate = None
2826
2827 if self._downloader.params.get('listformats', None):
2828 self._print_formats(formats)
2829 return
2830
2831 if req_format is None or req_format == 'best':
2832 for format_param in formats.keys():
2833 url_list = self.get_urls(formats, format_param)
2834 # check urls
2835 file_url = self.check_urls(url_list)
2836 if file_url is not None:
2837 break # got it!
2838 else:
2839 if req_format not in formats.keys():
2840 self._downloader.trouble(u'ERROR: format is not available')
2841 return
2842
2843 url_list = self.get_urls(formats, req_format)
2844 file_url = self.check_urls(url_list)
2845 format_param = req_format
2846
2847 return [{
2848 'id': file_id.decode('utf-8'),
2849 'url': file_url.decode('utf-8'),
2850 'uploader': uploader.decode('utf-8'),
2851 'upload_date': u'NA',
2852 'title': json_data['name'],
2853 'ext': file_url.split('.')[-1].decode('utf-8'),
2854 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2855 'thumbnail': json_data['thumbnail_url'],
2856 'description': json_data['description'],
2857 'player_url': player_url.decode('utf-8'),
2858 }]
2859
2860 class StanfordOpenClassroomIE(InfoExtractor):
2861 """Information extractor for Stanford's Open ClassRoom"""
2862
2863 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2864 IE_NAME = u'stanfordoc'
2865
2866 def report_download_webpage(self, objid):
2867 """Report information extraction."""
2868 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2869
2870 def report_extraction(self, video_id):
2871 """Report information extraction."""
2872 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2873
2874 def _real_extract(self, url):
2875 mobj = re.match(self._VALID_URL, url)
2876 if mobj is None:
2877 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2878 return
2879
2880 if mobj.group('course') and mobj.group('video'): # A specific video
2881 course = mobj.group('course')
2882 video = mobj.group('video')
2883 info = {
2884 'id': course + '_' + video,
2885 }
2886
2887 self.report_extraction(info['id'])
2888 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2889 xmlUrl = baseUrl + video + '.xml'
2890 try:
2891 metaXml = urllib2.urlopen(xmlUrl).read()
2892 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2893 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2894 return
2895 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2896 try:
2897 info['title'] = mdoc.findall('./title')[0].text
2898 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2899 except IndexError:
2900 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2901 return
2902 info['ext'] = info['url'].rpartition('.')[2]
2903 info['format'] = info['ext']
2904 return [info]
2905 elif mobj.group('course'): # A course page
2906 course = mobj.group('course')
2907 info = {
2908 'id': course,
2909 'type': 'playlist',
2910 }
2911
2912 self.report_download_webpage(info['id'])
2913 try:
2914 coursepage = urllib2.urlopen(url).read()
2915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2916 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2917 return
2918
2919 m = re.search('<h1>([^<]+)</h1>', coursepage)
2920 if m:
2921 info['title'] = unescapeHTML(m.group(1))
2922 else:
2923 info['title'] = info['id']
2924
2925 m = re.search('<description>([^<]+)</description>', coursepage)
2926 if m:
2927 info['description'] = unescapeHTML(m.group(1))
2928
2929 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2930 info['list'] = [
2931 {
2932 'type': 'reference',
2933 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2934 }
2935 for vpage in links]
2936 results = []
2937 for entry in info['list']:
2938 assert entry['type'] == 'reference'
2939 results += self.extract(entry['url'])
2940 return results
2941
2942 else: # Root page
2943 info = {
2944 'id': 'Stanford OpenClassroom',
2945 'type': 'playlist',
2946 }
2947
2948 self.report_download_webpage(info['id'])
2949 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2950 try:
2951 rootpage = urllib2.urlopen(rootURL).read()
2952 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2953 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2954 return
2955
2956 info['title'] = info['id']
2957
2958 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2959 info['list'] = [
2960 {
2961 'type': 'reference',
2962 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2963 }
2964 for cpage in links]
2965
2966 results = []
2967 for entry in info['list']:
2968 assert entry['type'] == 'reference'
2969 results += self.extract(entry['url'])
2970 return results
2971
2972 class MTVIE(InfoExtractor):
2973 """Information extractor for MTV.com"""
2974
2975 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2976 IE_NAME = u'mtv'
2977
2978 def report_webpage(self, video_id):
2979 """Report information extraction."""
2980 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2981
2982 def report_extraction(self, video_id):
2983 """Report information extraction."""
2984 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2985
2986 def _real_extract(self, url):
2987 mobj = re.match(self._VALID_URL, url)
2988 if mobj is None:
2989 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2990 return
2991 if not mobj.group('proto'):
2992 url = 'http://' + url
2993 video_id = mobj.group('videoid')
2994 self.report_webpage(video_id)
2995
2996 request = urllib2.Request(url)
2997 try:
2998 webpage = urllib2.urlopen(request).read()
2999 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3000 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3001 return
3002
3003 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3004 if mobj is None:
3005 self._downloader.trouble(u'ERROR: unable to extract song name')
3006 return
3007 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3008 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3009 if mobj is None:
3010 self._downloader.trouble(u'ERROR: unable to extract performer')
3011 return
3012 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3013 video_title = performer + ' - ' + song_name
3014
3015 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3016 if mobj is None:
3017 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3018 return
3019 mtvn_uri = mobj.group(1)
3020
3021 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3022 if mobj is None:
3023 self._downloader.trouble(u'ERROR: unable to extract content id')
3024 return
3025 content_id = mobj.group(1)
3026
3027 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3028 self.report_extraction(video_id)
3029 request = urllib2.Request(videogen_url)
3030 try:
3031 metadataXml = urllib2.urlopen(request).read()
3032 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3033 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3034 return
3035
3036 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3037 renditions = mdoc.findall('.//rendition')
3038
3039 # For now, always pick the highest quality.
3040 rendition = renditions[-1]
3041
3042 try:
3043 _,_,ext = rendition.attrib['type'].partition('/')
3044 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3045 video_url = rendition.find('./src').text
3046 except KeyError:
3047 self._downloader.trouble('Invalid rendition field.')
3048 return
3049
3050 info = {
3051 'id': video_id,
3052 'url': video_url,
3053 'uploader': performer,
3054 'title': video_title,
3055 'ext': ext,
3056 'format': format,
3057 }
3058
3059 return [info]
3060
3061
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the id alphabet with the site's seeded PRNG; returns a list."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # linear congruential step matching the player's scrambler
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Build per-segment download URLs for a Youku video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

        # TODO check error
        # youku only could be viewed from mainland china
        except Exception:
            # was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'title': video_title,
                'ext': ext,
                'format': u'NA'
            }
            files_info.append(info)

        return files_info
3182
3183
3184 class XNXXIE(InfoExtractor):
3185 """Information extractor for xnxx.com"""
3186
3187 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3188 IE_NAME = u'xnxx'
3189 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3190 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3191 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3192
3193 def report_webpage(self, video_id):
3194 """Report information extraction"""
3195 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3196
3197 def report_extraction(self, video_id):
3198 """Report information extraction"""
3199 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3200
3201 def _real_extract(self, url):
3202 mobj = re.match(self._VALID_URL, url)
3203 if mobj is None:
3204 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3205 return
3206 video_id = mobj.group(1).decode('utf-8')
3207
3208 self.report_webpage(video_id)
3209
3210 # Get webpage content
3211 try:
3212 webpage = urllib2.urlopen(url).read()
3213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3214 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3215 return
3216
3217 result = re.search(self.VIDEO_URL_RE, webpage)
3218 if result is None:
3219 self._downloader.trouble(u'ERROR: unable to extract video url')
3220 return
3221 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3222
3223 result = re.search(self.VIDEO_TITLE_RE, webpage)
3224 if result is None:
3225 self._downloader.trouble(u'ERROR: unable to extract video title')
3226 return
3227 video_title = result.group(1).decode('utf-8')
3228
3229 result = re.search(self.VIDEO_THUMB_RE, webpage)
3230 if result is None:
3231 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3232 return
3233 video_thumbnail = result.group(1).decode('utf-8')
3234
3235 info = {'id': video_id,
3236 'url': video_url,
3237 'uploader': None,
3238 'upload_date': None,
3239 'title': video_title,
3240 'ext': 'flv',
3241 'format': 'flv',
3242 'thumbnail': video_thumbnail,
3243 'description': None,
3244 'player_url': None}
3245
3246 return [info]
3247
3248
3249 class GooglePlusIE(InfoExtractor):
3250 """Information extractor for plus.google.com."""
3251
3252 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3253 IE_NAME = u'plus.google'
3254
3255 def __init__(self, downloader=None):
3256 InfoExtractor.__init__(self, downloader)
3257
3258 def report_extract_entry(self, url):
3259 """Report downloading extry"""
3260 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3261
3262 def report_date(self, upload_date):
3263 """Report downloading extry"""
3264 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3265
3266 def report_uploader(self, uploader):
3267 """Report downloading extry"""
3268 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3269
3270 def report_title(self, video_title):
3271 """Report downloading extry"""
3272 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3273
3274 def report_extract_vid_page(self, video_page):
3275 """Report information extraction."""
3276 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3277
3278 def _real_extract(self, url):
3279 # Extract id from URL
3280 mobj = re.match(self._VALID_URL, url)
3281 if mobj is None:
3282 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3283 return
3284
3285 post_url = mobj.group(0)
3286 video_id = mobj.group(2)
3287
3288 video_extension = 'flv'
3289
3290 # Step 1, Retrieve post webpage to extract further information
3291 self.report_extract_entry(post_url)
3292 request = urllib2.Request(post_url)
3293 try:
3294 webpage = urllib2.urlopen(request).read()
3295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3296 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3297 return
3298
3299 # Extract update date
3300 upload_date = u'NA'
3301 pattern = 'title="Timestamp">(.*?)</a>'
3302 mobj = re.search(pattern, webpage)
3303 if mobj:
3304 upload_date = mobj.group(1)
3305 # Convert timestring to a format suitable for filename
3306 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3307 upload_date = upload_date.strftime('%Y%m%d')
3308 self.report_date(upload_date)
3309
3310 # Extract uploader
3311 uploader = u'NA'
3312 pattern = r'rel\="author".*?>(.*?)</a>'
3313 mobj = re.search(pattern, webpage)
3314 if mobj:
3315 uploader = mobj.group(1)
3316 self.report_uploader(uploader)
3317
3318 # Extract title
3319 # Get the first line for title
3320 video_title = u'NA'
3321 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3322 mobj = re.search(pattern, webpage)
3323 if mobj:
3324 video_title = mobj.group(1)
3325 self.report_title(video_title)
3326
3327 # Step 2, Stimulate clicking the image box to launch video
3328 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3329 mobj = re.search(pattern, webpage)
3330 if mobj is None:
3331 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3332
3333 video_page = mobj.group(1)
3334 request = urllib2.Request(video_page)
3335 try:
3336 webpage = urllib2.urlopen(request).read()
3337 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3338 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3339 return
3340 self.report_extract_vid_page(video_page)
3341
3342
3343 # Extract video links on video page
3344 """Extract video links of all sizes"""
3345 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3346 mobj = re.findall(pattern, webpage)
3347 if len(mobj) == 0:
3348 self._downloader.trouble(u'ERROR: unable to extract video links')
3349
3350 # Sort in resolution
3351 links = sorted(mobj)
3352
3353 # Choose the lowest of the sort, i.e. highest resolution
3354 video_url = links[-1]
3355 # Only get the url. The resolution part in the tuple has no use anymore
3356 video_url = video_url[-1]
3357 # Treat escaped \u0026 style hex
3358 video_url = unicode(video_url, "unicode_escape")
3359
3360
3361 return [{
3362 'id': video_id.decode('utf-8'),
3363 'url': video_url,
3364 'uploader': uploader.decode('utf-8'),
3365 'upload_date': upload_date.decode('utf-8'),
3366 'title': video_title.decode('utf-8'),
3367 'ext': video_extension.decode('utf-8'),
3368 'format': u'NA',
3369 'player_url': None,
3370 }]