]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Use u instead of str in Python 2
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21 import cStringIO as StringIO
22 except ImportError:
23 import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) turns a URL into one or more dictionaries
    describing downloadable videos, which are then handed to the
    FileDownloader for processing. Each dictionary must carry at least:

        id:         Video identifier.
        url:        Final video URL.
        uploader:   Nickname of the video uploader.
        title:      Literal title.
        ext:        Video filename extension.
        format:     Video format.
        player_url: SWF Player URL (may be None).

    Optional fields, used only when their respective forced-printing
    functions are called (e.g. when youtube-dl serves as the backend for a
    video search front-end such as youtube2mp3):

        thumbnail:   Full URL to a video thumbnail image.
        description: One-line video description.

    Subclasses should override _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and usually be added to the list of
    extractors.
    """

    # Class-level defaults; instances shadow these via __init__/set_downloader.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching *downloader*."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when *url* matches this IE's _VALID_URL pattern."""
        return bool(re.match(self._VALID_URL, url))

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return extracted info as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports progress/errors through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
95
96
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Recognizes watch pages, embeds, youtu.be short links and bare video IDs
    (see _VALID_URL). _real_initialize() sets the interface language,
    optionally logs in (explicit credentials or .netrc) and confirms age;
    _real_extract() downloads the watch page plus get_video_info data, can
    fetch closed captions as SRT, and selects the requested format(s).
    """

    # Verbose (re.VERBOSE) pattern; group 2 is the video ID. The leading
    # scheme/host/path portion (group 1) is entirely optional, so a naked
    # video ID is accepted too.
    _VALID_URL = r"""^
                 (
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     )?                                                   # optional -> youtube.com/xxxx is OK
                 )?                                                       # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                 (?(1).+)?                                                # if we found the ID, everything can follow
                 $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the original URL out of age-verification-style redirects.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; itags missing here default to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string; used only by _print_formats (--list-formats).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to compile _VALID_URL with re.VERBOSE,
        # since the pattern above relies on whitespace and inline comments.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (SRT) text.

        Each <text start="..." dur="...">caption</text> element becomes one
        numbered SRT cue; a missing dur attribute defaults to 4 seconds.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Render as HH:MM:SS,mmm (SRT timestamp format).
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the available itags with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then optionally log in and confirm age.

        All failures here are soft (warnings) except age confirmation,
        which reports through trouble().
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % u(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next': '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
            return

    def _real_extract(self, url):
        """Download webpage + get_video_info and build the result dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JavaScript-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' values; the first response that contains a
        # 'token' parameter wins.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        # Scraped from the watch page; normalized to YYYYMMDD when one of
        # the known date formats matches, otherwise left as found (or 'NA').
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        # Any failure below is reported as a warning via the Trouble
        # exception and does not abort the extraction.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language priority: user-requested, then English, then any.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): assumes every stream with itag/url also carries
            # a 'sig' parameter -- a missing 'sig' would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
493
494
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    _real_initialize() accepts the site's family-filter disclaimer so that
    filtered videos are reachable. URLs whose ID starts with 'yt-' wrap a
    YouTube video and are delegated to the downloader's YouTube handling.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % u(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the downloader, which routes the YouTube URL to
            # the appropriate extractor; nothing is returned from here.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
622
623
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Downloads the video page with the family filter disabled, then picks
    the best quality URL available in the page's flashvars.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, uploader and upload date from the page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the title/query parts that may trail the ID in the URL.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages still render.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Quality keys in descending order of preference; the for/else
        # reports an error only when none of them is present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # Uploader is optional; warn and keep the u'NA' default.
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
721
722
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Tries the mp4 download_url first and falls back to the flv videoUrl
    embedded (JS-escaped) in the page.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and optional thumbnail."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download; fall back to the flv stream URL,
            # which is hex-escaped JavaScript in the page source.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the JS hex escapes: \x3d is '=', \x26 is '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        # Only fetched on demand; requires a second request to the search
        # page, which is why it is gated on the forcethumbnail option.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
816
817
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Matches only .flv media URLs (see _VALID_URL); title and uploader come
    from the page's <title> element.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from the video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader share one <title> pattern: "TITLE video by UPLOADER".
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
883
884
885 class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
887
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
893
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
896
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
900
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
904
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
908 if mobj is None:
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
910 return
911
912 video_id = mobj.group(2)
913 video_extension = 'flv'
914
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = urllib2.Request(url)
919 try:
920 webpage = urllib2.urlopen(request).read()
921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
923 return
924
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
926 if mobj is None:
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
928 return
929 yahoo_id = mobj.group(1)
930
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
932 if mobj is None:
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
934 return
935 yahoo_vid = mobj.group(1)
936
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
938 return self._real_extract(url, new_video=False)
939
940 # Retrieve video webpage to extract further information
941 request = urllib2.Request(url)
942 try:
943 self.report_download_webpage(video_id)
944 webpage = urllib2.urlopen(request).read()
945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
947 return
948
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
952 if mobj is None:
953 self._downloader.trouble(u'ERROR: unable to extract video title')
954 return
955 video_title = mobj.group(1).decode('utf-8')
956
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
958 if mobj is None:
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
960 return
961 video_uploader = mobj.group(1).decode('utf-8')
962
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
965 if mobj is None:
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
967 return
968 video_thumbnail = mobj.group(1).decode('utf-8')
969
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
972 if mobj is None:
973 self._downloader.trouble(u'ERROR: unable to extract video description')
974 return
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
978
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
981 if mobj is None:
982 self._downloader.trouble(u'ERROR: unable to extract video height')
983 return
984 yv_video_height = mobj.group(1)
985
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
987 if mobj is None:
988 self._downloader.trouble(u'ERROR: unable to extract video width')
989 return
990 yv_video_width = mobj.group(1)
991
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000 try:
1001 self.report_download_webpage(video_id)
1002 webpage = urllib2.urlopen(request).read()
1003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1005 return
1006
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009 if mobj is None:
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011 return
1012 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013 video_url = unescapeHTML(video_url)
1014
1015 return [{
1016 'id': video_id.decode('utf-8'),
1017 'url': video_url,
1018 'uploader': video_uploader,
1019 'upload_date': u'NA',
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
1024 'thumbnail': video_thumbnail,
1025 'player_url': None,
1026 }]
1027
1028
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and a play_redirect URL from a Vimeo page.

        Returns a one-element list of info dicts, or None (after
        reporting trouble) on any failure.  new_video is accepted but
        unused here — presumably for signature parity with other
        extractors; TODO confirm.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.  The page as a whole is not JSON; only
        # the fragment between ' = {config:' and ',assets:' is, so the
        # deliberately crude string split carves that fragment out.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        try:
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (optional — falls back to '')
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional — stays u'NA' when not present)
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    # neither 'hd' nor 'sd' listed: fall back to the first
                    # quality the config offers for this codec
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket in preference order; the
        # for/else reports an error only when every bucket is empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': None,
        }]
1141
1142
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting trouble.

        NOTE(review): callers (grep_webpage) do not check for the None
        failure value; re.search(regex, None) would then raise TypeError.
        """
        self._downloader.increment_downloads()
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
            return
        except ValueError, err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map capture groups to dict keys.

        matchTuples is a list of (group_index, key, error_message)
        triples; extraction stops (returning None) at the first missing
        group, after reporting the triple's error message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection and locate the rtmp URL.

        NOTE(review): video_url is computed but never returned, so this
        method always returns None and _real_extract yields no info dict
        for live URLs — this looks unfinished; confirm before relying on
        live-stream support.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through two XML indirections to an info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the flash player param carries the videoref XML URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        # Step 2: pick the language-specific <video> ref from that XML.
        next_url = urllib.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = urllib.unquote(info.get('url'))

        # Step 3: the final XML holds id, title, date and the hd URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': urllib.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and arte+7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages end in 'index-<n>.html'; everything else is arte+7.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1278
1279
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(urllib2.Request):
            # Issue a HEAD instead of urllib2's default GET: we only need
            # the final URL, not the response body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: they belonged to the
                    # original request and a HEAD carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the rejected response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-open with a plain (GET) Request through the same opener.
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                                        headers=newheaders,
                                                        origin_req_host=req.get_origin_req_host(),
                                                        unverifiable=True))

        # Build our opener
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects: nothing to restart.
        if url == new_url: return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain on the resolved URL.
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scrape a direct media URL out of the page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
            return
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
1425
1426
1427 class YoutubeSearchIE(InfoExtractor):
1428 """Information Extractor for YouTube search queries."""
1429 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1430 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1431 _max_youtube_results = 1000
1432 IE_NAME = u'youtube:search'
1433
1434 def __init__(self, downloader=None):
1435 InfoExtractor.__init__(self, downloader)
1436
1437 def report_download_page(self, query, pagenum):
1438 """Report attempt to download search page with given number."""
1439 query = query.decode(preferredencoding())
1440 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1441
1442 def _real_extract(self, query):
1443 mobj = re.match(self._VALID_URL, query)
1444 if mobj is None:
1445 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1446 return
1447
1448 prefix, query = query.split(':')
1449 prefix = prefix[8:]
1450 query = query.encode('utf-8')
1451 if prefix == '':
1452 self._download_n_results(query, 1)
1453 return
1454 elif prefix == 'all':
1455 self._download_n_results(query, self._max_youtube_results)
1456 return
1457 else:
1458 try:
1459 n = int(prefix)
1460 if n <= 0:
1461 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1462 return
1463 elif n > self._max_youtube_results:
1464 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1465 n = self._max_youtube_results
1466 self._download_n_results(query, n)
1467 return
1468 except ValueError: # parsing prefix as integer fails
1469 self._download_n_results(query, 1)
1470 return
1471
1472 def _download_n_results(self, query, n):
1473 """Downloads a specified number of results for a query"""
1474
1475 video_ids = []
1476 pagenum = 0
1477 limit = n
1478
1479 while (50 * pagenum) < limit:
1480 self.report_download_page(query, pagenum+1)
1481 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1482 request = urllib2.Request(result_url)
1483 try:
1484 data = urllib2.urlopen(request).read()
1485 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486 self._downloader.trouble(u'ERROR: unable to download API page: %s' % u(err))
1487 return
1488 api_response = json.loads(data)['data']
1489
1490 new_ids = list(video['id'] for video in api_response['items'])
1491 video_ids += new_ids
1492
1493 limit = min(n, api_response['totalItems'])
1494 pagenum += 1
1495
1496 if len(video_ids) > n:
1497 video_ids = video_ids[:n]
1498 for id in video_ids:
1499 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1500 return
1501
1502
1503 class GoogleSearchIE(InfoExtractor):
1504 """Information Extractor for Google Video search queries."""
1505 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1506 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1507 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1508 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1509 _max_google_results = 1000
1510 IE_NAME = u'video.google:search'
1511
1512 def __init__(self, downloader=None):
1513 InfoExtractor.__init__(self, downloader)
1514
1515 def report_download_page(self, query, pagenum):
1516 """Report attempt to download playlist page with given number."""
1517 query = query.decode(preferredencoding())
1518 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1519
1520 def _real_extract(self, query):
1521 mobj = re.match(self._VALID_URL, query)
1522 if mobj is None:
1523 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1524 return
1525
1526 prefix, query = query.split(':')
1527 prefix = prefix[8:]
1528 query = query.encode('utf-8')
1529 if prefix == '':
1530 self._download_n_results(query, 1)
1531 return
1532 elif prefix == 'all':
1533 self._download_n_results(query, self._max_google_results)
1534 return
1535 else:
1536 try:
1537 n = int(prefix)
1538 if n <= 0:
1539 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1540 return
1541 elif n > self._max_google_results:
1542 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1543 n = self._max_google_results
1544 self._download_n_results(query, n)
1545 return
1546 except ValueError: # parsing prefix as integer fails
1547 self._download_n_results(query, 1)
1548 return
1549
1550 def _download_n_results(self, query, n):
1551 """Downloads a specified number of results for a query"""
1552
1553 video_ids = []
1554 pagenum = 0
1555
1556 while True:
1557 self.report_download_page(query, pagenum)
1558 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1559 request = urllib2.Request(result_url)
1560 try:
1561 page = urllib2.urlopen(request).read()
1562 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1563 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1564 return
1565
1566 # Extract video identifiers
1567 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1568 video_id = mobj.group(1)
1569 if video_id not in video_ids:
1570 video_ids.append(video_id)
1571 if len(video_ids) == n:
1572 # Specified n videos reached
1573 for id in video_ids:
1574 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1575 return
1576
1577 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1578 for id in video_ids:
1579 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1580 return
1581
1582 pagenum = pagenum + 1
1583
1584
1585 class YahooSearchIE(InfoExtractor):
1586 """Information Extractor for Yahoo! Video search queries."""
1587 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1588 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1589 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1590 _MORE_PAGES_INDICATOR = r'\s*Next'
1591 _max_yahoo_results = 1000
1592 IE_NAME = u'video.yahoo:search'
1593
1594 def __init__(self, downloader=None):
1595 InfoExtractor.__init__(self, downloader)
1596
1597 def report_download_page(self, query, pagenum):
1598 """Report attempt to download playlist page with given number."""
1599 query = query.decode(preferredencoding())
1600 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1601
1602 def _real_extract(self, query):
1603 mobj = re.match(self._VALID_URL, query)
1604 if mobj is None:
1605 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1606 return
1607
1608 prefix, query = query.split(':')
1609 prefix = prefix[8:]
1610 query = query.encode('utf-8')
1611 if prefix == '':
1612 self._download_n_results(query, 1)
1613 return
1614 elif prefix == 'all':
1615 self._download_n_results(query, self._max_yahoo_results)
1616 return
1617 else:
1618 try:
1619 n = int(prefix)
1620 if n <= 0:
1621 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1622 return
1623 elif n > self._max_yahoo_results:
1624 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1625 n = self._max_yahoo_results
1626 self._download_n_results(query, n)
1627 return
1628 except ValueError: # parsing prefix as integer fails
1629 self._download_n_results(query, 1)
1630 return
1631
1632 def _download_n_results(self, query, n):
1633 """Downloads a specified number of results for a query"""
1634
1635 video_ids = []
1636 already_seen = set()
1637 pagenum = 1
1638
1639 while True:
1640 self.report_download_page(query, pagenum)
1641 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1642 request = urllib2.Request(result_url)
1643 try:
1644 page = urllib2.urlopen(request).read()
1645 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1646 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1647 return
1648
1649 # Extract video identifiers
1650 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1651 video_id = mobj.group(1)
1652 if video_id not in already_seen:
1653 video_ids.append(video_id)
1654 already_seen.add(video_id)
1655 if len(video_ids) == n:
1656 # Specified n videos reached
1657 for id in video_ids:
1658 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1659 return
1660
1661 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1662 for id in video_ids:
1663 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1664 return
1665
1666 pagenum = pagenum + 1
1667
1668
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Walk every page of a playlist and queue each contained video.

        _VALID_URL groups: (1) the query-parameter letter ('p', 'a' or
        'list'), (2) the playlist id, (3) an optional trailing video id.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        # NOTE(review): this hands the bare video id (not a full URL) to
        # the downloader — presumably another extractor matches plain
        # ids; confirm before changing.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # 'list'/'p'/None all collapse to the view_play_list form
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
                return

            # Extract video identifiers (deduplicated within a page only)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Honor --playlist-start/--playlist-end; playliststart is 1-based
        # in params, hence the -1.  playlistend == -1 means 'to the end'.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1740
1741
1742 class YoutubeChannelIE(InfoExtractor):
1743 """Information Extractor for YouTube channels."""
1744
1745 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1746 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1747 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1748 IE_NAME = u'youtube:channel'
1749
1750 def report_download_page(self, channel_id, pagenum):
1751 """Report attempt to download channel page with given number."""
1752 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1753
1754 def _real_extract(self, url):
1755 # Extract channel id
1756 mobj = re.match(self._VALID_URL, url)
1757 if mobj is None:
1758 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1759 return
1760
1761 # Download channel pages
1762 channel_id = mobj.group(1)
1763 video_ids = []
1764 pagenum = 1
1765
1766 while True:
1767 self.report_download_page(channel_id, pagenum)
1768 url = self._TEMPLATE_URL % (channel_id, pagenum)
1769 request = urllib2.Request(url)
1770 try:
1771 page = urllib2.urlopen(request).read()
1772 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1773 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1774 return
1775
1776 # Extract video identifiers
1777 ids_in_page = []
1778 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1779 if mobj.group(1) not in ids_in_page:
1780 ids_in_page.append(mobj.group(1))
1781 video_ids.extend(ids_in_page)
1782
1783 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1784 break
1785 pagenum = pagenum + 1
1786
1787 for id in video_ids:
1788 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1789 return
1790
1791
1792 class YoutubeUserIE(InfoExtractor):
1793 """Information Extractor for YouTube users."""
1794
1795 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1796 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1797 _GDATA_PAGE_SIZE = 50
1798 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1799 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1800 IE_NAME = u'youtube:user'
1801
1802 def __init__(self, downloader=None):
1803 InfoExtractor.__init__(self, downloader)
1804
1805 def report_download_page(self, username, start_index):
1806 """Report attempt to download user page."""
1807 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1808 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1809
1810 def _real_extract(self, url):
1811 # Extract username
1812 mobj = re.match(self._VALID_URL, url)
1813 if mobj is None:
1814 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1815 return
1816
1817 username = mobj.group(1)
1818
1819 # Download video ids using YouTube Data API. Result size per
1820 # query is limited (currently to 50 videos) so we need to query
1821 # page by page until there are no video ids - it means we got
1822 # all of them.
1823
1824 video_ids = []
1825 pagenum = 0
1826
1827 while True:
1828 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1829 self.report_download_page(username, start_index)
1830
1831 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1832
1833 try:
1834 page = urllib2.urlopen(request).read()
1835 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1836 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1837 return
1838
1839 # Extract video identifiers
1840 ids_in_page = []
1841
1842 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1843 if mobj.group(1) not in ids_in_page:
1844 ids_in_page.append(mobj.group(1))
1845
1846 video_ids.extend(ids_in_page)
1847
1848 # A little optimization - if current page is not
1849 # "full", ie. does not contain PAGE_SIZE video ids then
1850 # we can assume that this page is the last one - there
1851 # are no more ids on further pages - no need to query
1852 # again.
1853
1854 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1855 break
1856
1857 pagenum += 1
1858
1859 all_ids_count = len(video_ids)
1860 playliststart = self._downloader.params.get('playliststart', 1) - 1
1861 playlistend = self._downloader.params.get('playlistend', -1)
1862
1863 if playlistend == -1:
1864 video_ids = video_ids[playliststart:]
1865 else:
1866 video_ids = video_ids[playliststart:playlistend]
1867
1868 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1869 (username, all_ids_count, len(video_ids)))
1870
1871 for video_id in video_ids:
1872 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1873
1874
1875 class BlipTVUserIE(InfoExtractor):
1876 """Information Extractor for blip.tv users."""
1877
1878 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1879 _PAGE_SIZE = 12
1880 IE_NAME = u'blip.tv:user'
1881
1882 def __init__(self, downloader=None):
1883 InfoExtractor.__init__(self, downloader)
1884
1885 def report_download_page(self, username, pagenum):
1886 """Report attempt to download user page."""
1887 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1888 (self.IE_NAME, username, pagenum))
1889
1890 def _real_extract(self, url):
1891 # Extract username
1892 mobj = re.match(self._VALID_URL, url)
1893 if mobj is None:
1894 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1895 return
1896
1897 username = mobj.group(1)
1898
1899 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1900
1901 request = urllib2.Request(url)
1902
1903 try:
1904 page = urllib2.urlopen(request).read().decode('utf-8')
1905 mobj = re.search(r'data-users-id="([^"]+)"', page)
1906 page_base = page_base % mobj.group(1)
1907 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1908 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1909 return
1910
1911
1912 # Download video ids using BlipTV Ajax calls. Result size per
1913 # query is limited (currently to 12 videos) so we need to query
1914 # page by page until there are no video ids - it means we got
1915 # all of them.
1916
1917 video_ids = []
1918 pagenum = 1
1919
1920 while True:
1921 self.report_download_page(username, pagenum)
1922
1923 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1924
1925 try:
1926 page = urllib2.urlopen(request).read().decode('utf-8')
1927 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1928 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1929 return
1930
1931 # Extract video identifiers
1932 ids_in_page = []
1933
1934 for mobj in re.finditer(r'href="/([^"]+)"', page):
1935 if mobj.group(1) not in ids_in_page:
1936 ids_in_page.append(unescapeHTML(mobj.group(1)))
1937
1938 video_ids.extend(ids_in_page)
1939
1940 # A little optimization - if current page is not
1941 # "full", ie. does not contain PAGE_SIZE video ids then
1942 # we can assume that this page is the last one - there
1943 # are no more ids on further pages - no need to query
1944 # again.
1945
1946 if len(ids_in_page) < self._PAGE_SIZE:
1947 break
1948
1949 pagenum += 1
1950
1951 all_ids_count = len(video_ids)
1952 playliststart = self._downloader.params.get('playliststart', 1) - 1
1953 playlistend = self._downloader.params.get('playlistend', -1)
1954
1955 if playlistend == -1:
1956 video_ids = video_ids[playliststart:]
1957 else:
1958 video_ids = video_ids[playliststart:playlistend]
1959
1960 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1961 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1962
1963 for video_id in video_ids:
1964 self._downloader.download([u'http://blip.tv/'+video_id])
1965
1966
1967 class DepositFilesIE(InfoExtractor):
1968 """Information extractor for depositfiles.com"""
1969
1970 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1971 IE_NAME = u'DepositFiles'
1972
1973 def __init__(self, downloader=None):
1974 InfoExtractor.__init__(self, downloader)
1975
1976 def report_download_webpage(self, file_id):
1977 """Report webpage download."""
1978 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1979
1980 def report_extraction(self, file_id):
1981 """Report information extraction."""
1982 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1983
1984 def _real_extract(self, url):
1985 file_id = url.split('/')[-1]
1986 # Rebuild url in english locale
1987 url = 'http://depositfiles.com/en/files/' + file_id
1988
1989 # Retrieve file webpage with 'Free download' button pressed
1990 free_download_indication = { 'gateway_result' : '1' }
1991 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1992 try:
1993 self.report_download_webpage(file_id)
1994 webpage = urllib2.urlopen(request).read()
1995 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1996 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % u(err))
1997 return
1998
1999 # Search for the real file URL
2000 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2001 if (mobj is None) or (mobj.group(1) is None):
2002 # Try to figure out reason of the error.
2003 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2004 if (mobj is not None) and (mobj.group(1) is not None):
2005 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2006 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2007 else:
2008 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2009 return
2010
2011 file_url = mobj.group(1)
2012 file_extension = os.path.splitext(file_url)[1][1:]
2013
2014 # Search for file title
2015 mobj = re.search(r'<b title="(.*?)">', webpage)
2016 if mobj is None:
2017 self._downloader.trouble(u'ERROR: unable to extract title')
2018 return
2019 file_title = mobj.group(1).decode('utf-8')
2020
2021 return [{
2022 'id': file_id.decode('utf-8'),
2023 'url': file_url.decode('utf-8'),
2024 'uploader': u'NA',
2025 'upload_date': u'NA',
2026 'title': file_title,
2027 'ext': file_extension.decode('utf-8'),
2028 'format': u'NA',
2029 'player_url': None,
2030 }]
2031
2032
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format codes, best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    # format code -> container extension
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: field name -> regex with the value in group 1.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        # Fields whose regex does not match are simply left out of the dict;
        # callers check for their presence.
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data.

        Without credentials (or without a downloader) this silently does
        nothing; login failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form still present in the response means the login
            # did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
            return

    def _real_extract(self, url):
        """Return a list of info dicts, one per selected format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Deliberate best-effort: keep u'NA' on any parse failure.
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never
        # assigned and the loop below raises NameError - confirm whether
        # that path can occur in practice.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                # Old-style conditional: u'NA' when format_param is None.
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
2238
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a video file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a one-element list with the video's info dict.

        Two cases: the URL may answer with a video Content-Type (direct
        download) or with the JSON metadata requested via skin=json.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # 'urlhandle' hands the already-open response to the
                # downloader so the file is not requested twice.
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % u(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses may wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert e.g. '08-15-12 10:30AM' to '20120815'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): mutates the global std_headers, affecting every
        # later request in the process - confirm this is intended.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2325
2326
2327 class MyVideoIE(InfoExtractor):
2328 """Information Extractor for myvideo.de."""
2329
2330 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2331 IE_NAME = u'myvideo'
2332
2333 def __init__(self, downloader=None):
2334 InfoExtractor.__init__(self, downloader)
2335
2336 def report_download_webpage(self, video_id):
2337 """Report webpage download."""
2338 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2339
2340 def report_extraction(self, video_id):
2341 """Report information extraction."""
2342 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2343
2344 def _real_extract(self,url):
2345 mobj = re.match(self._VALID_URL, url)
2346 if mobj is None:
2347 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2348 return
2349
2350 video_id = mobj.group(1)
2351
2352 # Get video webpage
2353 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2354 try:
2355 self.report_download_webpage(video_id)
2356 webpage = urllib2.urlopen(request).read()
2357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2358 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
2359 return
2360
2361 self.report_extraction(video_id)
2362 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2363 webpage)
2364 if mobj is None:
2365 self._downloader.trouble(u'ERROR: unable to extract media URL')
2366 return
2367 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2368
2369 mobj = re.search('<title>([^<]+)</title>', webpage)
2370 if mobj is None:
2371 self._downloader.trouble(u'ERROR: unable to extract title')
2372 return
2373
2374 video_title = mobj.group(1)
2375
2376 return [{
2377 'id': video_id,
2378 'url': video_url,
2379 'uploader': u'NA',
2380 'upload_date': u'NA',
2381 'title': video_title,
2382 'ext': u'flv',
2383 'format': u'NA',
2384 'player_url': None,
2385 }]
2386
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a ":shortname" alias (newest full episode) or a
    # full-episodes page URL on either show's site.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # Known bitrates, lowest first; turls below preserves feed order.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame size, shown by --list-formats
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report media configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report show index download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report player URL resolution."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print available format codes with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return a list of info dicts, one per act of the episode."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # A shortname alias means "newest full episode of that show".
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the newest episode;
            # re-parse the final URL to learn its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the final player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + u(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + u(err))
            return

        results = []

        # Each <item> in the MRSS index is one act of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
                return

            # Collect (bitrate, url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2574
2575
2576 class EscapistIE(InfoExtractor):
2577 """Information extractor for The Escapist """
2578
2579 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2580 IE_NAME = u'escapist'
2581
2582 def report_extraction(self, showName):
2583 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2584
2585 def report_config_download(self, showName):
2586 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2587
2588 def _real_extract(self, url):
2589 mobj = re.match(self._VALID_URL, url)
2590 if mobj is None:
2591 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2592 return
2593 showName = mobj.group('showname')
2594 videoId = mobj.group('episode')
2595
2596 self.report_extraction(showName)
2597 try:
2598 webPage = urllib2.urlopen(url)
2599 webPageBytes = webPage.read()
2600 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2601 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2602 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2603 self._downloader.trouble(u'ERROR: unable to download webpage: ' + u(err))
2604 return
2605
2606 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2607 description = unescapeHTML(descMatch.group(1))
2608 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2609 imgUrl = unescapeHTML(imgMatch.group(1))
2610 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2611 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2612 configUrlMatch = re.search('config=(.*)$', playerUrl)
2613 configUrl = urllib2.unquote(configUrlMatch.group(1))
2614
2615 self.report_config_download(showName)
2616 try:
2617 configJSON = urllib2.urlopen(configUrl).read()
2618 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2619 self._downloader.trouble(u'ERROR: unable to download configuration: ' + u(err))
2620 return
2621
2622 # Technically, it's JavaScript, not JSON
2623 configJSON = configJSON.replace("'", '"')
2624
2625 try:
2626 config = json.loads(configJSON)
2627 except (ValueError,), err:
2628 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + u(err))
2629 return
2630
2631 playlist = config['playlist']
2632 videoUrl = playlist[1]['url']
2633
2634 info = {
2635 'id': videoId,
2636 'url': videoUrl,
2637 'uploader': showName,
2638 'upload_date': None,
2639 'title': showName,
2640 'ext': 'flv',
2641 'format': 'flv',
2642 'thumbnail': imgUrl,
2643 'description': description,
2644 'player_url': playerUrl,
2645 }
2646
2647 return [info]
2648
2649
2650 class CollegeHumorIE(InfoExtractor):
2651 """Information extractor for collegehumor.com"""
2652
2653 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2654 IE_NAME = u'collegehumor'
2655
2656 def report_webpage(self, video_id):
2657 """Report information extraction."""
2658 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2659
2660 def report_extraction(self, video_id):
2661 """Report information extraction."""
2662 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2663
2664 def _real_extract(self, url):
2665 mobj = re.match(self._VALID_URL, url)
2666 if mobj is None:
2667 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2668 return
2669 video_id = mobj.group('videoid')
2670
2671 self.report_webpage(video_id)
2672 request = urllib2.Request(url)
2673 try:
2674 webpage = urllib2.urlopen(request).read()
2675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2676 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2677 return
2678
2679 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2680 if m is None:
2681 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2682 return
2683 internal_video_id = m.group('internalvideoid')
2684
2685 info = {
2686 'id': video_id,
2687 'internal_id': internal_video_id,
2688 }
2689
2690 self.report_extraction(video_id)
2691 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2692 try:
2693 metaXml = urllib2.urlopen(xmlUrl).read()
2694 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2695 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
2696 return
2697
2698 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2699 try:
2700 videoNode = mdoc.findall('./video')[0]
2701 info['description'] = videoNode.findall('./description')[0].text
2702 info['title'] = videoNode.findall('./caption')[0].text
2703 info['url'] = videoNode.findall('./file')[0].text
2704 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2705 info['ext'] = info['url'].rpartition('.')[2]
2706 info['format'] = info['ext']
2707 except IndexError:
2708 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2709 return
2710
2711 return [info]
2712
2713
2714 class XVideosIE(InfoExtractor):
2715 """Information extractor for xvideos.com"""
2716
2717 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2718 IE_NAME = u'xvideos'
2719
2720 def report_webpage(self, video_id):
2721 """Report information extraction."""
2722 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2723
2724 def report_extraction(self, video_id):
2725 """Report information extraction."""
2726 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2727
2728 def _real_extract(self, url):
2729 mobj = re.match(self._VALID_URL, url)
2730 if mobj is None:
2731 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2732 return
2733 video_id = mobj.group(1).decode('utf-8')
2734
2735 self.report_webpage(video_id)
2736
2737 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2738 try:
2739 webpage = urllib2.urlopen(request).read()
2740 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2741 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2742 return
2743
2744 self.report_extraction(video_id)
2745
2746
2747 # Extract video URL
2748 mobj = re.search(r'flv_url=(.+?)&', webpage)
2749 if mobj is None:
2750 self._downloader.trouble(u'ERROR: unable to extract video url')
2751 return
2752 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2753
2754
2755 # Extract title
2756 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2757 if mobj is None:
2758 self._downloader.trouble(u'ERROR: unable to extract video title')
2759 return
2760 video_title = mobj.group(1).decode('utf-8')
2761
2762
2763 # Extract video thumbnail
2764 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2765 if mobj is None:
2766 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2767 return
2768 video_thumbnail = mobj.group(0).decode('utf-8')
2769
2770 info = {
2771 'id': video_id,
2772 'url': video_url,
2773 'uploader': None,
2774 'upload_date': None,
2775 'title': video_title,
2776 'ext': 'flv',
2777 'format': 'flv',
2778 'thumbnail': video_thumbnail,
2779 'description': None,
2780 'player_url': None,
2781 }
2782
2783 return [info]
2784
2785
2786 class SoundcloudIE(InfoExtractor):
2787 """Information extractor for soundcloud.com
2788 To access the media, the uid of the song and a stream token
2789 must be extracted from the page source and the script must make
2790 a request to media.soundcloud.com/crossdomain.xml. Then
2791 the media can be grabbed by requesting from an url composed
2792 of the stream token and uid
2793 """
2794
2795 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2796 IE_NAME = u'soundcloud'
2797
2798 def __init__(self, downloader=None):
2799 InfoExtractor.__init__(self, downloader)
2800
2801 def report_webpage(self, video_id):
2802 """Report information extraction."""
2803 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2804
2805 def report_extraction(self, video_id):
2806 """Report information extraction."""
2807 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2808
2809 def _real_extract(self, url):
2810 mobj = re.match(self._VALID_URL, url)
2811 if mobj is None:
2812 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2813 return
2814
2815 # extract uploader (which is in the url)
2816 uploader = mobj.group(1).decode('utf-8')
2817 # extract simple title (uploader + slug of song title)
2818 slug_title = mobj.group(2).decode('utf-8')
2819 simple_title = uploader + u'-' + slug_title
2820
2821 self.report_webpage('%s/%s' % (uploader, slug_title))
2822
2823 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2824 try:
2825 webpage = urllib2.urlopen(request).read()
2826 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2827 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2828 return
2829
2830 self.report_extraction('%s/%s' % (uploader, slug_title))
2831
2832 # extract uid and stream token that soundcloud hands out for access
2833 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2834 if mobj:
2835 video_id = mobj.group(1)
2836 stream_token = mobj.group(2)
2837
2838 # extract unsimplified title
2839 mobj = re.search('"title":"(.*?)",', webpage)
2840 if mobj:
2841 title = mobj.group(1).decode('utf-8')
2842 else:
2843 title = simple_title
2844
2845 # construct media url (with uid/token)
2846 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2847 mediaURL = mediaURL % (video_id, stream_token)
2848
2849 # description
2850 description = u'No description available'
2851 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2852 if mobj:
2853 description = mobj.group(1)
2854
2855 # upload date
2856 upload_date = None
2857 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2858 if mobj:
2859 try:
2860 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2861 except Exception, e:
2862 self._downloader.to_stderr(u(e))
2863
2864 # for soundcloud, a request to a cross domain is required for cookies
2865 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2866
2867 return [{
2868 'id': video_id.decode('utf-8'),
2869 'url': mediaURL,
2870 'uploader': uploader.decode('utf-8'),
2871 'upload_date': upload_date,
2872 'title': title,
2873 'ext': u'mp3',
2874 'format': u'NA',
2875 'player_url': None,
2876 'description': description.decode('utf-8')
2877 }]
2878
2879
2880 class InfoQIE(InfoExtractor):
2881 """Information extractor for infoq.com"""
2882
2883 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2884 IE_NAME = u'infoq'
2885
2886 def report_webpage(self, video_id):
2887 """Report information extraction."""
2888 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2889
2890 def report_extraction(self, video_id):
2891 """Report information extraction."""
2892 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2893
2894 def _real_extract(self, url):
2895 mobj = re.match(self._VALID_URL, url)
2896 if mobj is None:
2897 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2898 return
2899
2900 self.report_webpage(url)
2901
2902 request = urllib2.Request(url)
2903 try:
2904 webpage = urllib2.urlopen(request).read()
2905 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2906 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2907 return
2908
2909 self.report_extraction(url)
2910
2911
2912 # Extract video URL
2913 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2914 if mobj is None:
2915 self._downloader.trouble(u'ERROR: unable to extract video url')
2916 return
2917 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2918
2919
2920 # Extract title
2921 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2922 if mobj is None:
2923 self._downloader.trouble(u'ERROR: unable to extract video title')
2924 return
2925 video_title = mobj.group(1).decode('utf-8')
2926
2927 # Extract description
2928 video_description = u'No description available.'
2929 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2930 if mobj is not None:
2931 video_description = mobj.group(1).decode('utf-8')
2932
2933 video_filename = video_url.split('/')[-1]
2934 video_id, extension = video_filename.split('.')
2935
2936 info = {
2937 'id': video_id,
2938 'url': video_url,
2939 'uploader': None,
2940 'upload_date': None,
2941 'title': video_title,
2942 'ext': extension,
2943 'format': extension, # Extension is always(?) mp4, but seems to be flv
2944 'thumbnail': None,
2945 'description': video_description,
2946 'player_url': None,
2947 }
2948
2949 return [info]
2950
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""
	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_json(self, file_id):
		"""Report JSON download."""
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json"""
		file_url = None
		try:
			bitrate_list = jsonData[fmt]
			# None, 'best' or an unknown bitrate all fall back to the
			# highest bitrate key available for this format
			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
				bitrate = max(bitrate_list) # select highest

			url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			# jsonData[fmt] is already a flat url list, not a bitrate dict
			url_list = jsonData[fmt]
		return url_list

	def check_urls(self, url_list):
		"""Returns 1st active url from list"""
		for url in url_list:
			try:
				# a successful open means the mirror is alive
				urllib2.urlopen(url)
				return url
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				url = None

		# no url in the list responded
		return None

	def _print_formats(self, formats):
		# Print every format/bitrate pair with its file extension.
		print('Available formats:')
		for fmt in formats.keys():
			for b in formats[fmt]:
				try:
					ext = formats[fmt][b][0]
					print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
					break

	def _real_extract(self, url):
		"""Fetch the cloudcast JSON for *url* and pick a responding audio url."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# extract uploader & filename from url
		uploader = mobj.group(1).decode('utf-8')
		file_id = uploader + "-" + mobj.group(2).decode('utf-8')

		# construct API request
		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = urllib2.Request(file_url)
		try:
			self.report_download_json(file_url)
			jsonData = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % u(err))
			return

		# parse JSON
		json_data = json.loads(jsonData)
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])

		req_format = self._downloader.params.get('format', None)
		bitrate = None

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
			return

		if req_format is None or req_format == 'best':
			# try each format in turn until one of its urls responds
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				# check urls
				file_url = self.check_urls(url_list)
				if file_url is not None:
					break # got it!
		else:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')
				return

			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format

		return [{
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'uploader': uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': json_data['name'],
			'ext': file_url.split('.')[-1].decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': json_data['thumbnail_url'],
			'description': json_data['description'],
			'player_url': player_url.decode('utf-8'),
		}]
3063
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on the URL type: a single video, a course page, or the
		site root.  Course and root pages recurse through self.extract()
		on each linked page and return the accumulated results."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				# videoFile holds a path relative to the course video directory
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				# fall back to the course id when no heading is present
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# every VideoPage link on the course page becomes a reference entry
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				# recurse: each reference is handled by the video branch above
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
				return

			info['title'] = info['id']

			# every CoursePage link on the home page becomes a reference entry
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				# recurse: each reference is handled by the course branch above
				results += self.extract(entry['url'])
			return results
3175
3176 class MTVIE(InfoExtractor):
3177 """Information extractor for MTV.com"""
3178
3179 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3180 IE_NAME = u'mtv'
3181
3182 def report_webpage(self, video_id):
3183 """Report information extraction."""
3184 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3185
3186 def report_extraction(self, video_id):
3187 """Report information extraction."""
3188 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3189
3190 def _real_extract(self, url):
3191 mobj = re.match(self._VALID_URL, url)
3192 if mobj is None:
3193 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3194 return
3195 if not mobj.group('proto'):
3196 url = 'http://' + url
3197 video_id = mobj.group('videoid')
3198 self.report_webpage(video_id)
3199
3200 request = urllib2.Request(url)
3201 try:
3202 webpage = urllib2.urlopen(request).read()
3203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3204 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
3205 return
3206
3207 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3208 if mobj is None:
3209 self._downloader.trouble(u'ERROR: unable to extract song name')
3210 return
3211 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3212 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3213 if mobj is None:
3214 self._downloader.trouble(u'ERROR: unable to extract performer')
3215 return
3216 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3217 video_title = performer + ' - ' + song_name
3218
3219 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3220 if mobj is None:
3221 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3222 return
3223 mtvn_uri = mobj.group(1)
3224
3225 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3226 if mobj is None:
3227 self._downloader.trouble(u'ERROR: unable to extract content id')
3228 return
3229 content_id = mobj.group(1)
3230
3231 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3232 self.report_extraction(video_id)
3233 request = urllib2.Request(videogen_url)
3234 try:
3235 metadataXml = urllib2.urlopen(request).read()
3236 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3237 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % u(err))
3238 return
3239
3240 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3241 renditions = mdoc.findall('.//rendition')
3242
3243 # For now, always pick the highest quality.
3244 rendition = renditions[-1]
3245
3246 try:
3247 _,_,ext = rendition.attrib['type'].partition('/')
3248 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3249 video_url = rendition.find('./src').text
3250 except KeyError:
3251 self._downloader.trouble('Invalid rendition field.')
3252 return
3253
3254 info = {
3255 'id': video_id,
3256 'url': video_url,
3257 'uploader': performer,
3258 'title': video_title,
3259 'ext': ext,
3260 'format': format,
3261 }
3262
3263 return [info]
3264
3265
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com."""

	_VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Generate a pseudo-random session id: timestamp plus two randoms."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Build the character permutation table used to deobfuscate file ids.

		The seed drives a deterministic linear-congruential shuffle of the
		source alphabet, so equal seeds always yield the same table.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed = (seed * 211 + 30031 ) % 65536
			index = math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode an obfuscated '*'-separated file id via the seed's mix table."""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		"""Extract every segment of a Youku video as a separate info dict."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
			return

		self.report_extraction(video_id)
		try:
			config = json.loads(jsondata)

			video_title = config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# map the requested quality onto youku's own format names
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'

			fileid = config['data'][0]['streamfileids'][format]
			seg_number = len(config['data'][0]['segs'][format])

			keys = []
			for i in xrange(seg_number):
				keys.append(config['data'][0]['segs'][format][i]['k'])

			#TODO check error
			#youku only could be viewed from mainland china
		except (KeyError, IndexError, TypeError, ValueError):
			# narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
			# are no longer swallowed while parsing the info section
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info = []
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'title': video_title,
				'ext': ext,
				'format': u'NA'
			}
			files_info.append(info)

		return files_info
3386
3387
3388 class XNXXIE(InfoExtractor):
3389 """Information extractor for xnxx.com"""
3390
3391 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3392 IE_NAME = u'xnxx'
3393 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3394 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3395 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3396
3397 def report_webpage(self, video_id):
3398 """Report information extraction"""
3399 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3400
3401 def report_extraction(self, video_id):
3402 """Report information extraction"""
3403 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3404
3405 def _real_extract(self, url):
3406 mobj = re.match(self._VALID_URL, url)
3407 if mobj is None:
3408 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3409 return
3410 video_id = mobj.group(1).decode('utf-8')
3411
3412 self.report_webpage(video_id)
3413
3414 # Get webpage content
3415 try:
3416 webpage = urllib2.urlopen(url).read()
3417 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3418 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3419 return
3420
3421 result = re.search(self.VIDEO_URL_RE, webpage)
3422 if result is None:
3423 self._downloader.trouble(u'ERROR: unable to extract video url')
3424 return
3425 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3426
3427 result = re.search(self.VIDEO_TITLE_RE, webpage)
3428 if result is None:
3429 self._downloader.trouble(u'ERROR: unable to extract video title')
3430 return
3431 video_title = result.group(1).decode('utf-8')
3432
3433 result = re.search(self.VIDEO_THUMB_RE, webpage)
3434 if result is None:
3435 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3436 return
3437 video_thumbnail = result.group(1).decode('utf-8')
3438
3439 info = {'id': video_id,
3440 'url': video_url,
3441 'uploader': None,
3442 'upload_date': None,
3443 'title': video_title,
3444 'ext': 'flv',
3445 'format': 'flv',
3446 'thumbnail': video_thumbnail,
3447 'description': None,
3448 'player_url': None}
3449
3450 return [info]
3451
3452
3453 class GooglePlusIE(InfoExtractor):
3454 """Information extractor for plus.google.com."""
3455
3456 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3457 IE_NAME = u'plus.google'
3458
3459 def __init__(self, downloader=None):
3460 InfoExtractor.__init__(self, downloader)
3461
3462 def report_extract_entry(self, url):
3463 """Report downloading extry"""
3464 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3465
3466 def report_date(self, upload_date):
3467 """Report downloading extry"""
3468 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3469
3470 def report_uploader(self, uploader):
3471 """Report downloading extry"""
3472 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3473
3474 def report_title(self, video_title):
3475 """Report downloading extry"""
3476 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3477
3478 def report_extract_vid_page(self, video_page):
3479 """Report information extraction."""
3480 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3481
3482 def _real_extract(self, url):
3483 # Extract id from URL
3484 mobj = re.match(self._VALID_URL, url)
3485 if mobj is None:
3486 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3487 return
3488
3489 post_url = mobj.group(0)
3490 video_id = mobj.group(2)
3491
3492 video_extension = 'flv'
3493
3494 # Step 1, Retrieve post webpage to extract further information
3495 self.report_extract_entry(post_url)
3496 request = urllib2.Request(post_url)
3497 try:
3498 webpage = urllib2.urlopen(request).read()
3499 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3500 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % u(err))
3501 return
3502
3503 # Extract update date
3504 upload_date = u'NA'
3505 pattern = 'title="Timestamp">(.*?)</a>'
3506 mobj = re.search(pattern, webpage)
3507 if mobj:
3508 upload_date = mobj.group(1)
3509 # Convert timestring to a format suitable for filename
3510 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3511 upload_date = upload_date.strftime('%Y%m%d')
3512 self.report_date(upload_date)
3513
3514 # Extract uploader
3515 uploader = u'NA'
3516 pattern = r'rel\="author".*?>(.*?)</a>'
3517 mobj = re.search(pattern, webpage)
3518 if mobj:
3519 uploader = mobj.group(1)
3520 self.report_uploader(uploader)
3521
3522 # Extract title
3523 # Get the first line for title
3524 video_title = u'NA'
3525 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3526 mobj = re.search(pattern, webpage)
3527 if mobj:
3528 video_title = mobj.group(1)
3529 self.report_title(video_title)
3530
3531 # Step 2, Stimulate clicking the image box to launch video
3532 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3533 mobj = re.search(pattern, webpage)
3534 if mobj is None:
3535 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3536
3537 video_page = mobj.group(1)
3538 request = urllib2.Request(video_page)
3539 try:
3540 webpage = urllib2.urlopen(request).read()
3541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3542 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
3543 return
3544 self.report_extract_vid_page(video_page)
3545
3546
3547 # Extract video links on video page
3548 """Extract video links of all sizes"""
3549 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3550 mobj = re.findall(pattern, webpage)
3551 if len(mobj) == 0:
3552 self._downloader.trouble(u'ERROR: unable to extract video links')
3553
3554 # Sort in resolution
3555 links = sorted(mobj)
3556
3557 # Choose the lowest of the sort, i.e. highest resolution
3558 video_url = links[-1]
3559 # Only get the url. The resolution part in the tuple has no use anymore
3560 video_url = video_url[-1]
3561 # Treat escaped \u0026 style hex
3562 video_url = unicode(video_url, "unicode_escape")
3563
3564
3565 return [{
3566 'id': video_id.decode('utf-8'),
3567 'url': video_url,
3568 'uploader': uploader.decode('utf-8'),
3569 'upload_date': upload_date.decode('utf-8'),
3570 'title': video_title.decode('utf-8'),
3571 'ext': video_extension.decode('utf-8'),
3572 'format': u'NA',
3573 'player_url': None,
3574 }]