]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
6e59ba8fd740c789435c3ada8ba0a13dcb2297a8
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21 import cStringIO as StringIO
22 except ImportError:
23 import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor ("IE") receives a URL and returns a list of
    dictionaries describing the video(s) it refers to.  Each dictionary is
    handed to the FileDownloader, which may download the video or print
    selected fields.  The mandatory keys are:

        id:         Video identifier.
        url:        Final video URL.
        uploader:   Nickname of the video uploader.
        title:      Literal title.
        ext:        Video filename extension.
        format:     Video format.
        player_url: SWF Player URL (may be None).

    Optional keys, used mainly when youtube-dl backs a video search front
    end and their forced-printing functions are called:

        thumbnail:   Full URL to a video thumbnail image.
        description: One-line video description.

    Subclasses should override _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and usually be registered in the list of
    extractors.
    """

    # Class-level defaults; __init__ shadows them per instance.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor and optionally attach a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when this extractor can handle *url*."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Perform one-time setup (authentication, etc) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dicts for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this extractor reports through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Subclass hook: real initialization work.  Default is a no-op."""
        pass

    def _real_extract(self, url):
        """Subclass hook: real extraction work.  Default is a no-op."""
        pass
96
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Accepts watch URLs (including watch_popup/watch.php and #! forms),
    /v/, /embed/ and /e/ URLs, youtu.be short links, youtube-nocookie.com
    and tube.majestyc.net mirrors, and naked video IDs.  Playlist-style
    URLs are explicitly rejected by the _VALID_URL lookahead.
    """

    # VERBOSE pattern: group(1) matches the optional scheme/host/path
    # prefix, group(2) captures the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forcing hl=en / gl=US keeps scraped pages in a predictable locale.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything absent falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'HEIGHTxWIDTH' strings (height first), used only by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to compile _VALID_URL with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (SRT) text.

        Each <text start="S" dur="D">caption</text> element becomes one
        numbered SRT cue; a missing dur attribute defaults to 4 seconds.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Format as HH:MM:SS,mmm as required by SRT.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        # Print "itag : extension [dimensions]" for each available format.
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # The login form reappearing in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick format(s), and
        return a list of info dicts (one per selected format)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # group(2) is the video ID (group(1) is the optional URL prefix).
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-style backslash escaping (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one response
        # contains a 'token' parameter.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, then normalized to YYYYMMDD.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Collapse separators (/ , -) and whitespace to single spaces.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except also swallows KeyboardInterrupt;
                    # 'except ValueError:' would be sufficient here.  There is
                    # also no break on success, so later expressions re-try on
                    # the already-normalized string (harmlessly failing).
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: best-effort; failures raise Trouble, which is
        # caught below and reported as a warning instead of aborting.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # Map language code -> track name for each available track.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Preference: user-requested language, then English, then any.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # trouble[0] is the warning message passed to Trouble above.
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams expose a single connection URL with no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # The stream map is a comma-separated list of urlencoded dicts;
            # build itag -> signed URL from entries that carry both keys.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): 'sig' is not guarded like 'itag'/'url'; an entry
            # without it would raise KeyError here — confirm against current
            # stream maps.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
493
494
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Disables the family filter during initialization, then extracts the
    media URL either from the legacy &mediaURL= parameter or from the
    JSON-ish 'flashvars' mediaData blob.  Videos hosted on YouTube are
    delegated to the YouTube extractor via the downloader.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Re-queue the real YouTube URL on the downloader and stop here.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message typo — should read "unable to retrieve".
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Legacy page layout: media URL is a plain query parameter.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer layout: media URL lives inside the flashvars mediaData blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
622
623
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Scrapes the flashvars block of the watch page and picks the best
    available quality in a fixed preference order (HD 1080 down to the
    generic video_url).
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the title slug and query string from the path component.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated pages are still served.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message typo — should read "unable to retrieve".
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Quality preference order, best first; for/else fires when none match.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        # Uploader is optional: warn and fall back to NA instead of aborting.
        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        else:
            video_uploader = mobj.group(1)

        # Upload date is DD-MM-YYYY on the page; reassemble as YYYYMMDD.
        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
716
717
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Tries the mp4 download_url first and falls back to the flv videoUrl
    embedded (hex-escaped) in the page source.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download; fall back to the escaped flv URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the \xNN escaping used in the page source ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail: requires an extra search-page request, so
        # only done when the caller explicitly asked to print thumbnails.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
811
812
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Only matches URLs whose 'current' parameter names a .flv file; the
    media URL is read from the page's video_src <link> tag and the
    title/uploader from the <title> tag.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title pattern captures both the title (group 1) and uploader (group 2).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
878
879
880 class YahooIE(InfoExtractor):
881 """Information extractor for video.yahoo.com."""
882
883 # _VALID_URL matches all Yahoo! Video URLs
884 # _VPAGE_URL matches only the extractable '/watch/' URLs
885 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
886 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
887 IE_NAME = u'video.yahoo'
888
889 def __init__(self, downloader=None):
890 InfoExtractor.__init__(self, downloader)
891
892 def report_download_webpage(self, video_id):
893 """Report webpage download."""
894 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
895
896 def report_extraction(self, video_id):
897 """Report information extraction."""
898 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
899
900 def _real_extract(self, url, new_video=True):
901 # Extract ID from URL
902 mobj = re.match(self._VALID_URL, url)
903 if mobj is None:
904 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
905 return
906
907 video_id = mobj.group(2)
908 video_extension = 'flv'
909
910 # Rewrite valid but non-extractable URLs as
911 # extractable English language /watch/ URLs
912 if re.match(self._VPAGE_URL, url) is None:
913 request = urllib2.Request(url)
914 try:
915 webpage = urllib2.urlopen(request).read()
916 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
917 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
918 return
919
920 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
921 if mobj is None:
922 self._downloader.trouble(u'ERROR: Unable to extract id field')
923 return
924 yahoo_id = mobj.group(1)
925
926 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
927 if mobj is None:
928 self._downloader.trouble(u'ERROR: Unable to extract vid field')
929 return
930 yahoo_vid = mobj.group(1)
931
932 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
933 return self._real_extract(url, new_video=False)
934
935 # Retrieve video webpage to extract further information
936 request = urllib2.Request(url)
937 try:
938 self.report_download_webpage(video_id)
939 webpage = urllib2.urlopen(request).read()
940 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
941 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
942 return
943
944 # Extract uploader and title from webpage
945 self.report_extraction(video_id)
946 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
947 if mobj is None:
948 self._downloader.trouble(u'ERROR: unable to extract video title')
949 return
950 video_title = mobj.group(1).decode('utf-8')
951
952 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
953 if mobj is None:
954 self._downloader.trouble(u'ERROR: unable to extract video uploader')
955 return
956 video_uploader = mobj.group(1).decode('utf-8')
957
958 # Extract video thumbnail
959 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
960 if mobj is None:
961 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
962 return
963 video_thumbnail = mobj.group(1).decode('utf-8')
964
965 # Extract video description
966 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
967 if mobj is None:
968 self._downloader.trouble(u'ERROR: unable to extract video description')
969 return
970 video_description = mobj.group(1).decode('utf-8')
971 if not video_description:
972 video_description = 'No description available.'
973
974 # Extract video height and width
975 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
976 if mobj is None:
977 self._downloader.trouble(u'ERROR: unable to extract video height')
978 return
979 yv_video_height = mobj.group(1)
980
981 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
982 if mobj is None:
983 self._downloader.trouble(u'ERROR: unable to extract video width')
984 return
985 yv_video_width = mobj.group(1)
986
987 # Retrieve video playlist to extract media URL
988 # I'm not completely sure what all these options are, but we
989 # seem to need most of them, otherwise the server sends a 401.
990 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
991 yv_bitrate = '700' # according to Wikipedia this is hard-coded
992 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
993 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
994 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
995 try:
996 self.report_download_webpage(video_id)
997 webpage = urllib2.urlopen(request).read()
998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
999 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1000 return
1001
1002 # Extract media URL from playlist XML
1003 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1004 if mobj is None:
1005 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1006 return
1007 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1008 video_url = unescapeHTML(video_url)
1009
1010 return [{
1011 'id': video_id.decode('utf-8'),
1012 'url': video_url,
1013 'uploader': video_uploader,
1014 'upload_date': u'NA',
1015 'title': video_title,
1016 'ext': video_extension.decode('utf-8'),
1017 'thumbnail': video_thumbnail.decode('utf-8'),
1018 'description': video_description,
1019 'thumbnail': video_thumbnail,
1020 'player_url': None,
1021 }]
1022
1023
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the download URL and metadata from the page's embedded
        config JSON.  `new_video` is accepted for signature parity with the
        other extractors but is not used here.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): crude string slicing between ' = {config:' and
        # ',assets:'; if either marker is missing, the [1] index raises
        # IndexError *outside* the try below — presumably the markers are
        # always present on real pages (verify).
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        try:
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        # The description lives in the page HTML, not the config JSON.
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        # Optional: falls back to u'NA' when the span is absent.
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        # Both are required query parameters of the play_redirect URL below.
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # Codecs are tried in preference order; the for/else reports an error
        # only when none of the known codecs appears in the files dict.
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': None,
        }]
1125
1126
1127 class GenericIE(InfoExtractor):
1128 """Generic last-resort information extractor."""
1129
1130 _VALID_URL = r'.*'
1131 IE_NAME = u'generic'
1132
1133 def __init__(self, downloader=None):
1134 InfoExtractor.__init__(self, downloader)
1135
1136 def report_download_webpage(self, video_id):
1137 """Report webpage download."""
1138 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1139 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1140
1141 def report_extraction(self, video_id):
1142 """Report information extraction."""
1143 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1144
1145 def report_following_redirect(self, new_url):
1146 """Report information extraction."""
1147 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1148
1149 def _test_redirect(self, url):
1150 """Check if it is a redirect, like url shorteners, in case restart chain."""
1151 class HeadRequest(urllib2.Request):
1152 def get_method(self):
1153 return "HEAD"
1154
1155 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1156 """
1157 Subclass the HTTPRedirectHandler to make it use our
1158 HeadRequest also on the redirected URL
1159 """
1160 def redirect_request(self, req, fp, code, msg, headers, newurl):
1161 if code in (301, 302, 303, 307):
1162 newurl = newurl.replace(' ', '%20')
1163 newheaders = dict((k,v) for k,v in req.headers.items()
1164 if k.lower() not in ("content-length", "content-type"))
1165 return HeadRequest(newurl,
1166 headers=newheaders,
1167 origin_req_host=req.get_origin_req_host(),
1168 unverifiable=True)
1169 else:
1170 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1171
1172 class HTTPMethodFallback(urllib2.BaseHandler):
1173 """
1174 Fallback to GET if HEAD is not allowed (405 HTTP error)
1175 """
1176 def http_error_405(self, req, fp, code, msg, headers):
1177 fp.read()
1178 fp.close()
1179
1180 newheaders = dict((k,v) for k,v in req.headers.items()
1181 if k.lower() not in ("content-length", "content-type"))
1182 return self.parent.open(urllib2.Request(req.get_full_url(),
1183 headers=newheaders,
1184 origin_req_host=req.get_origin_req_host(),
1185 unverifiable=True))
1186
1187 # Build our opener
1188 opener = urllib2.OpenerDirector()
1189 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1190 HTTPMethodFallback, HEADRedirectHandler,
1191 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1192 opener.add_handler(handler())
1193
1194 response = opener.open(HeadRequest(url))
1195 new_url = response.geturl()
1196
1197 if url == new_url: return False
1198
1199 self.report_following_redirect(new_url)
1200 self._downloader.download([new_url])
1201 return True
1202
1203 def _real_extract(self, url):
1204 if self._test_redirect(url): return
1205
1206 video_id = url.split('/')[-1]
1207 request = urllib2.Request(url)
1208 try:
1209 self.report_download_webpage(video_id)
1210 webpage = urllib2.urlopen(request).read()
1211 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1212 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1213 return
1214 except ValueError, err:
1215 # since this is the last-resort InfoExtractor, if
1216 # this error is thrown, it'll be thrown here
1217 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1218 return
1219
1220 self.report_extraction(video_id)
1221 # Start with something easy: JW Player in SWFObject
1222 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1223 if mobj is None:
1224 # Broaden the search a little bit
1225 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1226 if mobj is None:
1227 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1228 return
1229
1230 # It's possible that one of the regexes
1231 # matched, but returned an empty group:
1232 if mobj.group(1) is None:
1233 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1234 return
1235
1236 video_url = urllib.unquote(mobj.group(1))
1237 video_id = os.path.basename(video_url)
1238
1239 # here's a fun little line of code for you:
1240 video_extension = os.path.splitext(video_id)[1][1:]
1241 video_id = os.path.splitext(video_id)[0]
1242
1243 # it's tempting to parse this further, but you would
1244 # have to take into account all the variations like
1245 # Video Title - Site Name
1246 # Site Name | Video Title
1247 # Video Title - Tagline | Site Name
1248 # and so on and so forth; it's just not practical
1249 mobj = re.search(r'<title>(.*)</title>', webpage)
1250 if mobj is None:
1251 self._downloader.trouble(u'ERROR: unable to extract title')
1252 return
1253 video_title = mobj.group(1).decode('utf-8')
1254
1255 # video uploader is domain name
1256 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1257 if mobj is None:
1258 self._downloader.trouble(u'ERROR: unable to extract title')
1259 return
1260 video_uploader = mobj.group(1).decode('utf-8')
1261
1262 return [{
1263 'id': video_id.decode('utf-8'),
1264 'url': video_url.decode('utf-8'),
1265 'uploader': video_uploader,
1266 'upload_date': u'NA',
1267 'title': video_title,
1268 'ext': video_extension.decode('utf-8'),
1269 'format': u'NA',
1270 'player_url': None,
1271 }]
1272
1273
1274 class YoutubeSearchIE(InfoExtractor):
1275 """Information Extractor for YouTube search queries."""
1276 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1277 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1278 _max_youtube_results = 1000
1279 IE_NAME = u'youtube:search'
1280
1281 def __init__(self, downloader=None):
1282 InfoExtractor.__init__(self, downloader)
1283
1284 def report_download_page(self, query, pagenum):
1285 """Report attempt to download search page with given number."""
1286 query = query.decode(preferredencoding())
1287 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1288
1289 def _real_extract(self, query):
1290 mobj = re.match(self._VALID_URL, query)
1291 if mobj is None:
1292 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1293 return
1294
1295 prefix, query = query.split(':')
1296 prefix = prefix[8:]
1297 query = query.encode('utf-8')
1298 if prefix == '':
1299 self._download_n_results(query, 1)
1300 return
1301 elif prefix == 'all':
1302 self._download_n_results(query, self._max_youtube_results)
1303 return
1304 else:
1305 try:
1306 n = long(prefix)
1307 if n <= 0:
1308 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1309 return
1310 elif n > self._max_youtube_results:
1311 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1312 n = self._max_youtube_results
1313 self._download_n_results(query, n)
1314 return
1315 except ValueError: # parsing prefix as integer fails
1316 self._download_n_results(query, 1)
1317 return
1318
1319 def _download_n_results(self, query, n):
1320 """Downloads a specified number of results for a query"""
1321
1322 video_ids = []
1323 pagenum = 0
1324 limit = n
1325
1326 while (50 * pagenum) < limit:
1327 self.report_download_page(query, pagenum+1)
1328 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1329 request = urllib2.Request(result_url)
1330 try:
1331 data = urllib2.urlopen(request).read()
1332 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1333 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1334 return
1335 api_response = json.loads(data)['data']
1336
1337 new_ids = list(video['id'] for video in api_response['items'])
1338 video_ids += new_ids
1339
1340 limit = min(n, api_response['totalItems'])
1341 pagenum += 1
1342
1343 if len(video_ids) > n:
1344 video_ids = video_ids[:n]
1345 for id in video_ids:
1346 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1347 return
1348
1349
1350 class GoogleSearchIE(InfoExtractor):
1351 """Information Extractor for Google Video search queries."""
1352 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1353 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1354 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1355 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1356 _max_google_results = 1000
1357 IE_NAME = u'video.google:search'
1358
1359 def __init__(self, downloader=None):
1360 InfoExtractor.__init__(self, downloader)
1361
1362 def report_download_page(self, query, pagenum):
1363 """Report attempt to download playlist page with given number."""
1364 query = query.decode(preferredencoding())
1365 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1366
1367 def _real_extract(self, query):
1368 mobj = re.match(self._VALID_URL, query)
1369 if mobj is None:
1370 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1371 return
1372
1373 prefix, query = query.split(':')
1374 prefix = prefix[8:]
1375 query = query.encode('utf-8')
1376 if prefix == '':
1377 self._download_n_results(query, 1)
1378 return
1379 elif prefix == 'all':
1380 self._download_n_results(query, self._max_google_results)
1381 return
1382 else:
1383 try:
1384 n = long(prefix)
1385 if n <= 0:
1386 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1387 return
1388 elif n > self._max_google_results:
1389 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1390 n = self._max_google_results
1391 self._download_n_results(query, n)
1392 return
1393 except ValueError: # parsing prefix as integer fails
1394 self._download_n_results(query, 1)
1395 return
1396
1397 def _download_n_results(self, query, n):
1398 """Downloads a specified number of results for a query"""
1399
1400 video_ids = []
1401 pagenum = 0
1402
1403 while True:
1404 self.report_download_page(query, pagenum)
1405 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1406 request = urllib2.Request(result_url)
1407 try:
1408 page = urllib2.urlopen(request).read()
1409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1411 return
1412
1413 # Extract video identifiers
1414 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1415 video_id = mobj.group(1)
1416 if video_id not in video_ids:
1417 video_ids.append(video_id)
1418 if len(video_ids) == n:
1419 # Specified n videos reached
1420 for id in video_ids:
1421 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1422 return
1423
1424 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1425 for id in video_ids:
1426 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1427 return
1428
1429 pagenum = pagenum + 1
1430
1431
1432 class YahooSearchIE(InfoExtractor):
1433 """Information Extractor for Yahoo! Video search queries."""
1434 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1435 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1436 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1437 _MORE_PAGES_INDICATOR = r'\s*Next'
1438 _max_yahoo_results = 1000
1439 IE_NAME = u'video.yahoo:search'
1440
1441 def __init__(self, downloader=None):
1442 InfoExtractor.__init__(self, downloader)
1443
1444 def report_download_page(self, query, pagenum):
1445 """Report attempt to download playlist page with given number."""
1446 query = query.decode(preferredencoding())
1447 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1448
1449 def _real_extract(self, query):
1450 mobj = re.match(self._VALID_URL, query)
1451 if mobj is None:
1452 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1453 return
1454
1455 prefix, query = query.split(':')
1456 prefix = prefix[8:]
1457 query = query.encode('utf-8')
1458 if prefix == '':
1459 self._download_n_results(query, 1)
1460 return
1461 elif prefix == 'all':
1462 self._download_n_results(query, self._max_yahoo_results)
1463 return
1464 else:
1465 try:
1466 n = long(prefix)
1467 if n <= 0:
1468 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1469 return
1470 elif n > self._max_yahoo_results:
1471 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1472 n = self._max_yahoo_results
1473 self._download_n_results(query, n)
1474 return
1475 except ValueError: # parsing prefix as integer fails
1476 self._download_n_results(query, 1)
1477 return
1478
1479 def _download_n_results(self, query, n):
1480 """Downloads a specified number of results for a query"""
1481
1482 video_ids = []
1483 already_seen = set()
1484 pagenum = 1
1485
1486 while True:
1487 self.report_download_page(query, pagenum)
1488 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1489 request = urllib2.Request(result_url)
1490 try:
1491 page = urllib2.urlopen(request).read()
1492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1493 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1494 return
1495
1496 # Extract video identifiers
1497 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1498 video_id = mobj.group(1)
1499 if video_id not in already_seen:
1500 video_ids.append(video_id)
1501 already_seen.add(video_id)
1502 if len(video_ids) == n:
1503 # Specified n videos reached
1504 for id in video_ids:
1505 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1506 return
1507
1508 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1509 for id in video_ids:
1510 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1511 return
1512
1513 pagenum = pagenum + 1
1514
1515
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: playlist type prefix ('p', 'a', 'list', ...); group 2: the
    # playlist id; group 3, when present, marks a single-video URL.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids from a playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # Any other prefix (including 'list') is normalized to the
            # classic p/view_play_list form.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            # Deduplicated within a page only; cross-page duplicates are kept.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the pager's "next" control is absent.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Apply --playlist-start/--playlist-end.  playliststart is converted
        # to a 0-based index; playlistend is used as an absolute end index
        # (not a count), with -1 meaning "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1587
1588
1589 class YoutubeChannelIE(InfoExtractor):
1590 """Information Extractor for YouTube channels."""
1591
1592 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1593 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1594 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1595 IE_NAME = u'youtube:channel'
1596
1597 def report_download_page(self, channel_id, pagenum):
1598 """Report attempt to download channel page with given number."""
1599 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1600
1601 def _real_extract(self, url):
1602 # Extract channel id
1603 mobj = re.match(self._VALID_URL, url)
1604 if mobj is None:
1605 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1606 return
1607
1608 # Download channel pages
1609 channel_id = mobj.group(1)
1610 video_ids = []
1611 pagenum = 1
1612
1613 while True:
1614 self.report_download_page(channel_id, pagenum)
1615 url = self._TEMPLATE_URL % (channel_id, pagenum)
1616 request = urllib2.Request(url)
1617 try:
1618 page = urllib2.urlopen(request).read()
1619 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1620 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1621 return
1622
1623 # Extract video identifiers
1624 ids_in_page = []
1625 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1626 if mobj.group(1) not in ids_in_page:
1627 ids_in_page.append(mobj.group(1))
1628 video_ids.extend(ids_in_page)
1629
1630 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1631 break
1632 pagenum = pagenum + 1
1633
1634 for id in video_ids:
1635 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1636 return
1637
1638
1639 class YoutubeUserIE(InfoExtractor):
1640 """Information Extractor for YouTube users."""
1641
1642 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1643 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1644 _GDATA_PAGE_SIZE = 50
1645 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1646 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1647 IE_NAME = u'youtube:user'
1648
1649 def __init__(self, downloader=None):
1650 InfoExtractor.__init__(self, downloader)
1651
1652 def report_download_page(self, username, start_index):
1653 """Report attempt to download user page."""
1654 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1655 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1656
1657 def _real_extract(self, url):
1658 # Extract username
1659 mobj = re.match(self._VALID_URL, url)
1660 if mobj is None:
1661 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1662 return
1663
1664 username = mobj.group(1)
1665
1666 # Download video ids using YouTube Data API. Result size per
1667 # query is limited (currently to 50 videos) so we need to query
1668 # page by page until there are no video ids - it means we got
1669 # all of them.
1670
1671 video_ids = []
1672 pagenum = 0
1673
1674 while True:
1675 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1676 self.report_download_page(username, start_index)
1677
1678 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1679
1680 try:
1681 page = urllib2.urlopen(request).read()
1682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1684 return
1685
1686 # Extract video identifiers
1687 ids_in_page = []
1688
1689 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1690 if mobj.group(1) not in ids_in_page:
1691 ids_in_page.append(mobj.group(1))
1692
1693 video_ids.extend(ids_in_page)
1694
1695 # A little optimization - if current page is not
1696 # "full", ie. does not contain PAGE_SIZE video ids then
1697 # we can assume that this page is the last one - there
1698 # are no more ids on further pages - no need to query
1699 # again.
1700
1701 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1702 break
1703
1704 pagenum += 1
1705
1706 all_ids_count = len(video_ids)
1707 playliststart = self._downloader.params.get('playliststart', 1) - 1
1708 playlistend = self._downloader.params.get('playlistend', -1)
1709
1710 if playlistend == -1:
1711 video_ids = video_ids[playliststart:]
1712 else:
1713 video_ids = video_ids[playliststart:playlistend]
1714
1715 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1716 (username, all_ids_count, len(video_ids)))
1717
1718 for video_id in video_ids:
1719 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1720
1721
1722 class BlipTVUserIE(InfoExtractor):
1723 """Information Extractor for blip.tv users."""
1724
1725 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1726 _PAGE_SIZE = 12
1727 IE_NAME = u'blip.tv:user'
1728
1729 def __init__(self, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1731
1732 def report_download_page(self, username, pagenum):
1733 """Report attempt to download user page."""
1734 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1735 (self.IE_NAME, username, pagenum))
1736
1737 def _real_extract(self, url):
1738 # Extract username
1739 mobj = re.match(self._VALID_URL, url)
1740 if mobj is None:
1741 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1742 return
1743
1744 username = mobj.group(1)
1745
1746 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1747
1748 request = urllib2.Request(url)
1749
1750 try:
1751 page = urllib2.urlopen(request).read().decode('utf-8')
1752 mobj = re.search(r'data-users-id="([^"]+)"', page)
1753 page_base = page_base % mobj.group(1)
1754 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1755 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1756 return
1757
1758
1759 # Download video ids using BlipTV Ajax calls. Result size per
1760 # query is limited (currently to 12 videos) so we need to query
1761 # page by page until there are no video ids - it means we got
1762 # all of them.
1763
1764 video_ids = []
1765 pagenum = 1
1766
1767 while True:
1768 self.report_download_page(username, pagenum)
1769
1770 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1771
1772 try:
1773 page = urllib2.urlopen(request).read().decode('utf-8')
1774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1776 return
1777
1778 # Extract video identifiers
1779 ids_in_page = []
1780
1781 for mobj in re.finditer(r'href="/([^"]+)"', page):
1782 if mobj.group(1) not in ids_in_page:
1783 ids_in_page.append(unescapeHTML(mobj.group(1)))
1784
1785 video_ids.extend(ids_in_page)
1786
1787 # A little optimization - if current page is not
1788 # "full", ie. does not contain PAGE_SIZE video ids then
1789 # we can assume that this page is the last one - there
1790 # are no more ids on further pages - no need to query
1791 # again.
1792
1793 if len(ids_in_page) < self._PAGE_SIZE:
1794 break
1795
1796 pagenum += 1
1797
1798 all_ids_count = len(video_ids)
1799 playliststart = self._downloader.params.get('playliststart', 1) - 1
1800 playlistend = self._downloader.params.get('playlistend', -1)
1801
1802 if playlistend == -1:
1803 video_ids = video_ids[playliststart:]
1804 else:
1805 video_ids = video_ids[playliststart:playlistend]
1806
1807 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1808 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1809
1810 for video_id in video_ids:
1811 self._downloader.download([u'http://blip.tv/'+video_id])
1812
1813
1814 class DepositFilesIE(InfoExtractor):
1815 """Information extractor for depositfiles.com"""
1816
1817 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1818 IE_NAME = u'DepositFiles'
1819
1820 def __init__(self, downloader=None):
1821 InfoExtractor.__init__(self, downloader)
1822
1823 def report_download_webpage(self, file_id):
1824 """Report webpage download."""
1825 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1826
1827 def report_extraction(self, file_id):
1828 """Report information extraction."""
1829 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1830
1831 def _real_extract(self, url):
1832 file_id = url.split('/')[-1]
1833 # Rebuild url in english locale
1834 url = 'http://depositfiles.com/en/files/' + file_id
1835
1836 # Retrieve file webpage with 'Free download' button pressed
1837 free_download_indication = { 'gateway_result' : '1' }
1838 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1839 try:
1840 self.report_download_webpage(file_id)
1841 webpage = urllib2.urlopen(request).read()
1842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1844 return
1845
1846 # Search for the real file URL
1847 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1848 if (mobj is None) or (mobj.group(1) is None):
1849 # Try to figure out reason of the error.
1850 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1851 if (mobj is not None) and (mobj.group(1) is not None):
1852 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1853 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1854 else:
1855 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1856 return
1857
1858 file_url = mobj.group(1)
1859 file_extension = os.path.splitext(file_url)[1][1:]
1860
1861 # Search for file title
1862 mobj = re.search(r'<b title="(.*?)">', webpage)
1863 if mobj is None:
1864 self._downloader.trouble(u'ERROR: unable to extract title')
1865 return
1866 file_title = mobj.group(1).decode('utf-8')
1867
1868 return [{
1869 'id': file_id.decode('utf-8'),
1870 'url': file_url.decode('utf-8'),
1871 'uploader': u'NA',
1872 'upload_date': u'NA',
1873 'title': file_title,
1874 'ext': file_extension.decode('utf-8'),
1875 'format': u'NA',
1876 'player_url': None,
1877 }]
1878
1879
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names as they appear in the page source; index 0 is treated
    # as best quality by the selection code in _real_extract.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are percent-quoted inside a JS unicode-escaped segment.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Uses --username/--password, or with --usenetrc the 'facebook'
        machine entry from ~/.netrc. Without credentials this is a no-op.
        Failures are warnings only; extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Return a list of info dicts (one per selected format) for a Facebook video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz yields a 10-tuple; strftime takes the first 9 fields.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is unbound here
        # and the loop below raises NameError — confirm whether an empty
        # url_map can occur in practice.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
2085
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info via blip.tv's JSON API.

        The page is requested with skin=json appended; if the server
        instead answers with the media itself (Content-Type video/*),
        the URL is treated as a direct download and the open handle is
        passed along in 'urlhandle'.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Pick the query-string separator depending on whether the URL
        # already carries parameters.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh is always bound here: info can only still be None
                # when urlopen succeeded without the direct-download branch.
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the metadata in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): presumably the media server serves (some) files only
        # to an iTunes user agent — confirm; this mutates the global headers
        # for all subsequent requests.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2172
2173
2174 class MyVideoIE(InfoExtractor):
2175 """Information Extractor for myvideo.de."""
2176
2177 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2178 IE_NAME = u'myvideo'
2179
2180 def __init__(self, downloader=None):
2181 InfoExtractor.__init__(self, downloader)
2182
2183 def report_download_webpage(self, video_id):
2184 """Report webpage download."""
2185 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2186
2187 def report_extraction(self, video_id):
2188 """Report information extraction."""
2189 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2190
2191 def _real_extract(self,url):
2192 mobj = re.match(self._VALID_URL, url)
2193 if mobj is None:
2194 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2195 return
2196
2197 video_id = mobj.group(1)
2198
2199 # Get video webpage
2200 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2201 try:
2202 self.report_download_webpage(video_id)
2203 webpage = urllib2.urlopen(request).read()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2206 return
2207
2208 self.report_extraction(video_id)
2209 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2210 webpage)
2211 if mobj is None:
2212 self._downloader.trouble(u'ERROR: unable to extract media URL')
2213 return
2214 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2215
2216 mobj = re.search('<title>([^<]+)</title>', webpage)
2217 if mobj is None:
2218 self._downloader.trouble(u'ERROR: unable to extract title')
2219 return
2220
2221 video_title = mobj.group(1)
2222
2223 return [{
2224 'id': video_id,
2225 'url': video_url,
2226 'uploader': u'NA',
2227 'upload_date': u'NA',
2228 'title': video_title,
2229 'ext': u'flv',
2230 'format': u'NA',
2231 'player_url': None,
2232 }]
2233
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a ':tds'/':colbert'-style shortname or a full-episodes URL.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Extract all media parts of an episode.

        Shortname URLs are rewritten to the show's full-episodes page,
        which redirects to the newest episode. The episode page embeds an
        mtvnservices Flash URL; its mrss feed lists the individual parts,
        each with its own mediaGen configuration of available renditions.
        Returns one info dict per part.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No specific episode in the URL means "download the newest one".
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Recover the concrete episode name from the redirected URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid has the form ...:<show>.com:<media id>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            results.append(info)

        return results
2365
2366
2367 class EscapistIE(InfoExtractor):
2368 """Information extractor for The Escapist """
2369
2370 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2371 IE_NAME = u'escapist'
2372
2373 def report_extraction(self, showName):
2374 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2375
2376 def report_config_download(self, showName):
2377 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2378
2379 def _real_extract(self, url):
2380 mobj = re.match(self._VALID_URL, url)
2381 if mobj is None:
2382 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2383 return
2384 showName = mobj.group('showname')
2385 videoId = mobj.group('episode')
2386
2387 self.report_extraction(showName)
2388 try:
2389 webPage = urllib2.urlopen(url)
2390 webPageBytes = webPage.read()
2391 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2392 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2394 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2395 return
2396
2397 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2398 description = unescapeHTML(descMatch.group(1))
2399 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2400 imgUrl = unescapeHTML(imgMatch.group(1))
2401 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2402 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2403 configUrlMatch = re.search('config=(.*)$', playerUrl)
2404 configUrl = urllib2.unquote(configUrlMatch.group(1))
2405
2406 self.report_config_download(showName)
2407 try:
2408 configJSON = urllib2.urlopen(configUrl).read()
2409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2410 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2411 return
2412
2413 # Technically, it's JavaScript, not JSON
2414 configJSON = configJSON.replace("'", '"')
2415
2416 try:
2417 config = json.loads(configJSON)
2418 except (ValueError,), err:
2419 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2420 return
2421
2422 playlist = config['playlist']
2423 videoUrl = playlist[1]['url']
2424
2425 info = {
2426 'id': videoId,
2427 'url': videoUrl,
2428 'uploader': showName,
2429 'upload_date': None,
2430 'title': showName,
2431 'ext': 'flv',
2432 'format': 'flv',
2433 'thumbnail': imgUrl,
2434 'description': description,
2435 'player_url': playerUrl,
2436 }
2437
2438 return [info]
2439
2440
2441 class CollegeHumorIE(InfoExtractor):
2442 """Information extractor for collegehumor.com"""
2443
2444 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2445 IE_NAME = u'collegehumor'
2446
2447 def report_webpage(self, video_id):
2448 """Report information extraction."""
2449 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2450
2451 def report_extraction(self, video_id):
2452 """Report information extraction."""
2453 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2454
2455 def _real_extract(self, url):
2456 mobj = re.match(self._VALID_URL, url)
2457 if mobj is None:
2458 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2459 return
2460 video_id = mobj.group('videoid')
2461
2462 self.report_webpage(video_id)
2463 request = urllib2.Request(url)
2464 try:
2465 webpage = urllib2.urlopen(request).read()
2466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2468 return
2469
2470 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2471 if m is None:
2472 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2473 return
2474 internal_video_id = m.group('internalvideoid')
2475
2476 info = {
2477 'id': video_id,
2478 'internal_id': internal_video_id,
2479 }
2480
2481 self.report_extraction(video_id)
2482 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2483 try:
2484 metaXml = urllib2.urlopen(xmlUrl).read()
2485 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2487 return
2488
2489 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2490 try:
2491 videoNode = mdoc.findall('./video')[0]
2492 info['description'] = videoNode.findall('./description')[0].text
2493 info['title'] = videoNode.findall('./caption')[0].text
2494 info['url'] = videoNode.findall('./file')[0].text
2495 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2496 info['ext'] = info['url'].rpartition('.')[2]
2497 info['format'] = info['ext']
2498 except IndexError:
2499 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2500 return
2501
2502 return [info]
2503
2504
2505 class XVideosIE(InfoExtractor):
2506 """Information extractor for xvideos.com"""
2507
2508 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2509 IE_NAME = u'xvideos'
2510
2511 def report_webpage(self, video_id):
2512 """Report information extraction."""
2513 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2514
2515 def report_extraction(self, video_id):
2516 """Report information extraction."""
2517 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2518
2519 def _real_extract(self, url):
2520 mobj = re.match(self._VALID_URL, url)
2521 if mobj is None:
2522 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2523 return
2524 video_id = mobj.group(1).decode('utf-8')
2525
2526 self.report_webpage(video_id)
2527
2528 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2529 try:
2530 webpage = urllib2.urlopen(request).read()
2531 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2532 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2533 return
2534
2535 self.report_extraction(video_id)
2536
2537
2538 # Extract video URL
2539 mobj = re.search(r'flv_url=(.+?)&', webpage)
2540 if mobj is None:
2541 self._downloader.trouble(u'ERROR: unable to extract video url')
2542 return
2543 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2544
2545
2546 # Extract title
2547 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2548 if mobj is None:
2549 self._downloader.trouble(u'ERROR: unable to extract video title')
2550 return
2551 video_title = mobj.group(1).decode('utf-8')
2552
2553
2554 # Extract video thumbnail
2555 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2556 if mobj is None:
2557 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2558 return
2559 video_thumbnail = mobj.group(0).decode('utf-8')
2560
2561 info = {
2562 'id': video_id,
2563 'url': video_url,
2564 'uploader': None,
2565 'upload_date': None,
2566 'title': video_title,
2567 'ext': 'flv',
2568 'format': 'flv',
2569 'thumbnail': video_thumbnail,
2570 'description': None,
2571 'player_url': None,
2572 }
2573
2574 return [info]
2575
2576
2577 class SoundcloudIE(InfoExtractor):
2578 """Information extractor for soundcloud.com
2579 To access the media, the uid of the song and a stream token
2580 must be extracted from the page source and the script must make
2581 a request to media.soundcloud.com/crossdomain.xml. Then
2582 the media can be grabbed by requesting from an url composed
2583 of the stream token and uid
2584 """
2585
2586 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2587 IE_NAME = u'soundcloud'
2588
2589 def __init__(self, downloader=None):
2590 InfoExtractor.__init__(self, downloader)
2591
2592 def report_webpage(self, video_id):
2593 """Report information extraction."""
2594 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2595
2596 def report_extraction(self, video_id):
2597 """Report information extraction."""
2598 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2599
2600 def _real_extract(self, url):
2601 mobj = re.match(self._VALID_URL, url)
2602 if mobj is None:
2603 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2604 return
2605
2606 # extract uploader (which is in the url)
2607 uploader = mobj.group(1).decode('utf-8')
2608 # extract simple title (uploader + slug of song title)
2609 slug_title = mobj.group(2).decode('utf-8')
2610 simple_title = uploader + u'-' + slug_title
2611
2612 self.report_webpage('%s/%s' % (uploader, slug_title))
2613
2614 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2615 try:
2616 webpage = urllib2.urlopen(request).read()
2617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2619 return
2620
2621 self.report_extraction('%s/%s' % (uploader, slug_title))
2622
2623 # extract uid and stream token that soundcloud hands out for access
2624 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2625 if mobj:
2626 video_id = mobj.group(1)
2627 stream_token = mobj.group(2)
2628
2629 # extract unsimplified title
2630 mobj = re.search('"title":"(.*?)",', webpage)
2631 if mobj:
2632 title = mobj.group(1).decode('utf-8')
2633 else:
2634 title = simple_title
2635
2636 # construct media url (with uid/token)
2637 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2638 mediaURL = mediaURL % (video_id, stream_token)
2639
2640 # description
2641 description = u'No description available'
2642 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2643 if mobj:
2644 description = mobj.group(1)
2645
2646 # upload date
2647 upload_date = None
2648 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2649 if mobj:
2650 try:
2651 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2652 except Exception, e:
2653 self._downloader.to_stderr(str(e))
2654
2655 # for soundcloud, a request to a cross domain is required for cookies
2656 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2657
2658 return [{
2659 'id': video_id.decode('utf-8'),
2660 'url': mediaURL,
2661 'uploader': uploader.decode('utf-8'),
2662 'upload_date': upload_date,
2663 'title': title,
2664 'ext': u'mp3',
2665 'format': u'NA',
2666 'player_url': None,
2667 'description': description.decode('utf-8')
2668 }]
2669
2670
2671 class InfoQIE(InfoExtractor):
2672 """Information extractor for infoq.com"""
2673
2674 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2675 IE_NAME = u'infoq'
2676
2677 def report_webpage(self, video_id):
2678 """Report information extraction."""
2679 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2680
2681 def report_extraction(self, video_id):
2682 """Report information extraction."""
2683 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2684
2685 def _real_extract(self, url):
2686 mobj = re.match(self._VALID_URL, url)
2687 if mobj is None:
2688 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2689 return
2690
2691 self.report_webpage(url)
2692
2693 request = urllib2.Request(url)
2694 try:
2695 webpage = urllib2.urlopen(request).read()
2696 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2697 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2698 return
2699
2700 self.report_extraction(url)
2701
2702
2703 # Extract video URL
2704 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2705 if mobj is None:
2706 self._downloader.trouble(u'ERROR: unable to extract video url')
2707 return
2708 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2709
2710
2711 # Extract title
2712 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2713 if mobj is None:
2714 self._downloader.trouble(u'ERROR: unable to extract video title')
2715 return
2716 video_title = mobj.group(1).decode('utf-8')
2717
2718 # Extract description
2719 video_description = u'No description available.'
2720 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2721 if mobj is not None:
2722 video_description = mobj.group(1).decode('utf-8')
2723
2724 video_filename = video_url.split('/')[-1]
2725 video_id, extension = video_filename.split('.')
2726
2727 info = {
2728 'id': video_id,
2729 'url': video_url,
2730 'uploader': None,
2731 'upload_date': None,
2732 'title': video_title,
2733 'ext': extension,
2734 'format': extension, # Extension is always(?) mp4, but seems to be flv
2735 'thumbnail': None,
2736 'description': video_description,
2737 'player_url': None,
2738 }
2739
2740 return [info]
2741
2742 class MixcloudIE(InfoExtractor):
2743 """Information extractor for www.mixcloud.com"""
2744 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2745 IE_NAME = u'mixcloud'
2746
2747 def __init__(self, downloader=None):
2748 InfoExtractor.__init__(self, downloader)
2749
2750 def report_download_json(self, file_id):
2751 """Report JSON download."""
2752 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2753
2754 def report_extraction(self, file_id):
2755 """Report information extraction."""
2756 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2757
2758 def get_urls(self, jsonData, fmt, bitrate='best'):
2759 """Get urls from 'audio_formats' section in json"""
2760 file_url = None
2761 try:
2762 bitrate_list = jsonData[fmt]
2763 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764 bitrate = max(bitrate_list) # select highest
2765
2766 url_list = jsonData[fmt][bitrate]
2767 except TypeError: # we have no bitrate info.
2768 url_list = jsonData[fmt]
2769 return url_list
2770
2771 def check_urls(self, url_list):
2772 """Returns 1st active url from list"""
2773 for url in url_list:
2774 try:
2775 urllib2.urlopen(url)
2776 return url
2777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778 url = None
2779
2780 return None
2781
2782 def _print_formats(self, formats):
2783 print 'Available formats:'
2784 for fmt in formats.keys():
2785 for b in formats[fmt]:
2786 try:
2787 ext = formats[fmt][b][0]
2788 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2789 except TypeError: # we have no bitrate info
2790 ext = formats[fmt][0]
2791 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2792 break
2793
2794 def _real_extract(self, url):
2795 mobj = re.match(self._VALID_URL, url)
2796 if mobj is None:
2797 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798 return
2799 # extract uploader & filename from url
2800 uploader = mobj.group(1).decode('utf-8')
2801 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2802
2803 # construct API request
2804 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2805 # retrieve .json file with links to files
2806 request = urllib2.Request(file_url)
2807 try:
2808 self.report_download_json(file_url)
2809 jsonData = urllib2.urlopen(request).read()
2810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2811 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2812 return
2813
2814 # parse JSON
2815 json_data = json.loads(jsonData)
2816 player_url = json_data['player_swf_url']
2817 formats = dict(json_data['audio_formats'])
2818
2819 req_format = self._downloader.params.get('format', None)
2820 bitrate = None
2821
2822 if self._downloader.params.get('listformats', None):
2823 self._print_formats(formats)
2824 return
2825
2826 if req_format is None or req_format == 'best':
2827 for format_param in formats.keys():
2828 url_list = self.get_urls(formats, format_param)
2829 # check urls
2830 file_url = self.check_urls(url_list)
2831 if file_url is not None:
2832 break # got it!
2833 else:
2834 if req_format not in formats.keys():
2835 self._downloader.trouble(u'ERROR: format is not available')
2836 return
2837
2838 url_list = self.get_urls(formats, req_format)
2839 file_url = self.check_urls(url_list)
2840 format_param = req_format
2841
2842 return [{
2843 'id': file_id.decode('utf-8'),
2844 'url': file_url.decode('utf-8'),
2845 'uploader': uploader.decode('utf-8'),
2846 'upload_date': u'NA',
2847 'title': json_data['name'],
2848 'ext': file_url.split('.')[-1].decode('utf-8'),
2849 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2850 'thumbnail': json_data['thumbnail_url'],
2851 'description': json_data['description'],
2852 'player_url': player_url.decode('utf-8'),
2853 }]
2854
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three kinds of URLs via named groups in _VALID_URL:
      * a specific video   (course= and video= present) -> one media entry
      * a course page      (only course= present)       -> playlist of videos
      * the site root                                    -> playlist of courses
    The two playlist branches recurse through self.extract() on the
    referenced pages.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # <title> and <videoFile> (relative path) are both required.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the media path.
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            # Course title falls back to the course id when no <h1> is found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each unique VideoPage link and recurse into it.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']

            # Collect each unique CoursePage link and recurse into it.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2966
2967 class MTVIE(InfoExtractor):
2968 """Information extractor for MTV.com"""
2969
2970 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2971 IE_NAME = u'mtv'
2972
2973 def report_webpage(self, video_id):
2974 """Report information extraction."""
2975 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2976
2977 def report_extraction(self, video_id):
2978 """Report information extraction."""
2979 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2980
2981 def _real_extract(self, url):
2982 mobj = re.match(self._VALID_URL, url)
2983 if mobj is None:
2984 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2985 return
2986 if not mobj.group('proto'):
2987 url = 'http://' + url
2988 video_id = mobj.group('videoid')
2989 self.report_webpage(video_id)
2990
2991 request = urllib2.Request(url)
2992 try:
2993 webpage = urllib2.urlopen(request).read()
2994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2996 return
2997
2998 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2999 if mobj is None:
3000 self._downloader.trouble(u'ERROR: unable to extract song name')
3001 return
3002 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3003 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3004 if mobj is None:
3005 self._downloader.trouble(u'ERROR: unable to extract performer')
3006 return
3007 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3008 video_title = performer + ' - ' + song_name
3009
3010 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3011 if mobj is None:
3012 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3013 return
3014 mtvn_uri = mobj.group(1)
3015
3016 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3017 if mobj is None:
3018 self._downloader.trouble(u'ERROR: unable to extract content id')
3019 return
3020 content_id = mobj.group(1)
3021
3022 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3023 self.report_extraction(video_id)
3024 request = urllib2.Request(videogen_url)
3025 try:
3026 metadataXml = urllib2.urlopen(request).read()
3027 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3028 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3029 return
3030
3031 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3032 renditions = mdoc.findall('.//rendition')
3033
3034 # For now, always pick the highest quality.
3035 rendition = renditions[-1]
3036
3037 try:
3038 _,_,ext = rendition.attrib['type'].partition('/')
3039 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3040 video_url = rendition.find('./src').text
3041 except KeyError:
3042 self._downloader.trouble('Invalid rendition field.')
3043 return
3044
3045 info = {
3046 'id': video_id,
3047 'url': video_url,
3048 'uploader': performer,
3049 'title': video_title,
3050 'ext': ext,
3051 'format': format,
3052 }
3053
3054 return [info]
3055
3056
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a pseudo-random session id (timestamp + two random runs)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode the file id.

        Implements Youku's linear-congruential shuffle of a fixed source
        alphabet; the same seed always yields the same ordering.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Map the '*'-separated index string to real characters via the mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys=[]
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

        #TODO check error
        #youku only could be viewed from mainland china
        # Narrowed from a bare 'except:': only malformed JSON / missing keys
        # are expected here, and a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        except (ValueError, KeyError, IndexError, TypeError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'title': video_title,
                'ext': ext,
                'format': u'NA'
            }
            files_info.append(info)

        return files_info
3177
3178
3179 class XNXXIE(InfoExtractor):
3180 """Information extractor for xnxx.com"""
3181
3182 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3183 IE_NAME = u'xnxx'
3184 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3185 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3186 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3187
3188 def report_webpage(self, video_id):
3189 """Report information extraction"""
3190 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3191
3192 def report_extraction(self, video_id):
3193 """Report information extraction"""
3194 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3195
3196 def _real_extract(self, url):
3197 mobj = re.match(self._VALID_URL, url)
3198 if mobj is None:
3199 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3200 return
3201 video_id = mobj.group(1).decode('utf-8')
3202
3203 self.report_webpage(video_id)
3204
3205 # Get webpage content
3206 try:
3207 webpage = urllib2.urlopen(url).read()
3208 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3209 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3210 return
3211
3212 result = re.search(self.VIDEO_URL_RE, webpage)
3213 if result is None:
3214 self._downloader.trouble(u'ERROR: unable to extract video url')
3215 return
3216 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3217
3218 result = re.search(self.VIDEO_TITLE_RE, webpage)
3219 if result is None:
3220 self._downloader.trouble(u'ERROR: unable to extract video title')
3221 return
3222 video_title = result.group(1).decode('utf-8')
3223
3224 result = re.search(self.VIDEO_THUMB_RE, webpage)
3225 if result is None:
3226 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3227 return
3228 video_thumbnail = result.group(1).decode('utf-8')
3229
3230 info = {'id': video_id,
3231 'url': video_url,
3232 'uploader': None,
3233 'upload_date': None,
3234 'title': video_title,
3235 'ext': 'flv',
3236 'format': 'flv',
3237 'thumbnail': video_thumbnail,
3238 'description': None,
3239 'player_url': None}
3240
3241 return [info]
3242
3243
3244 class GooglePlusIE(InfoExtractor):
3245 """Information extractor for plus.google.com."""
3246
3247 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3248 IE_NAME = u'plus.google'
3249
3250 def __init__(self, downloader=None):
3251 InfoExtractor.__init__(self, downloader)
3252
3253 def report_extract_entry(self, url):
3254 """Report downloading extry"""
3255 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3256
3257 def report_date(self, upload_date):
3258 """Report downloading extry"""
3259 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3260
3261 def report_uploader(self, uploader):
3262 """Report downloading extry"""
3263 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3264
3265 def report_title(self, video_title):
3266 """Report downloading extry"""
3267 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3268
3269 def report_extract_vid_page(self, video_page):
3270 """Report information extraction."""
3271 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3272
3273 def _real_extract(self, url):
3274 # Extract id from URL
3275 mobj = re.match(self._VALID_URL, url)
3276 if mobj is None:
3277 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3278 return
3279
3280 post_url = mobj.group(0)
3281 video_id = mobj.group(2)
3282
3283 video_extension = 'flv'
3284
3285 # Step 1, Retrieve post webpage to extract further information
3286 self.report_extract_entry(post_url)
3287 request = urllib2.Request(post_url)
3288 try:
3289 webpage = urllib2.urlopen(request).read()
3290 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3291 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3292 return
3293
3294 # Extract update date
3295 upload_date = u'NA'
3296 pattern = 'title="Timestamp">(.*?)</a>'
3297 mobj = re.search(pattern, webpage)
3298 if mobj:
3299 upload_date = mobj.group(1)
3300 # Convert timestring to a format suitable for filename
3301 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3302 upload_date = upload_date.strftime('%Y%m%d')
3303 self.report_date(upload_date)
3304
3305 # Extract uploader
3306 uploader = u'NA'
3307 pattern = r'rel\="author".*?>(.*?)</a>'
3308 mobj = re.search(pattern, webpage)
3309 if mobj:
3310 uploader = mobj.group(1)
3311 self.report_uploader(uploader)
3312
3313 # Extract title
3314 # Get the first line for title
3315 video_title = u'NA'
3316 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3317 mobj = re.search(pattern, webpage)
3318 if mobj:
3319 video_title = mobj.group(1)
3320 self.report_title(video_title)
3321
3322 # Step 2, Stimulate clicking the image box to launch video
3323 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3324 mobj = re.search(pattern, webpage)
3325 if mobj is None:
3326 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3327
3328 video_page = mobj.group(1)
3329 request = urllib2.Request(video_page)
3330 try:
3331 webpage = urllib2.urlopen(request).read()
3332 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3333 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3334 return
3335 self.report_extract_vid_page(video_page)
3336
3337
3338 # Extract video links on video page
3339 """Extract video links of all sizes"""
3340 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3341 mobj = re.findall(pattern, webpage)
3342 if len(mobj) == 0:
3343 self._downloader.trouble(u'ERROR: unable to extract video links')
3344
3345 # Sort in resolution
3346 links = sorted(mobj)
3347
3348 # Choose the lowest of the sort, i.e. highest resolution
3349 video_url = links[-1]
3350 # Only get the url. The resolution part in the tuple has no use anymore
3351 video_url = video_url[-1]
3352 # Treat escaped \u0026 style hex
3353 video_url = unicode(video_url, "unicode_escape")
3354
3355
3356 return [{
3357 'id': video_id.decode('utf-8'),
3358 'url': video_url,
3359 'uploader': uploader.decode('utf-8'),
3360 'upload_date': upload_date.decode('utf-8'),
3361 'title': video_title.decode('utf-8'),
3362 'ext': video_extension.decode('utf-8'),
3363 'format': u'NA',
3364 'player_url': None,
3365 }]