# [scrape artifact] jfr.im git blame view of youtube_dl/InfoExtractors.py
# Commit: "Fix EscapistMagazine IE" (blame base d77c3dfd)
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import datetime
5import HTMLParser
6import httplib
7import netrc
8import os
9import re
10import socket
11import time
12import urllib
13import urllib2
14import email.utils
921a1455
FV
15import xml.etree.ElementTree
16from urlparse import parse_qs
d77c3dfd
FV
17
18try:
19 import cStringIO as StringIO
20except ImportError:
21 import StringIO
22
d11d05d0 23from utils import *
d77c3dfd
FV
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # Shared defaults; instances shadow these in __init__ / initialize().
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
93
94
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles language forcing, optional login (explicit credentials or
    .netrc), age-gate confirmation, format selection and optional
    closed-caption download (converted to SRT).
    """

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything unlisted is assumed 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string used by _print_formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (SRT) text.

        Each <text> element becomes a numbered SRT cue; captions with no
        duration attribute default to 4 seconds.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # seconds -> "HH:MM:SS,mmm" SRT timestamps
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, then optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (forces English pages so later regexes match)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':        '/',
                'action_login':    'Log In',
                'username':    username,
                'password':    password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':        '/',
                'action_confirm':    'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and format URLs; return a list of info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-style backslash escaping
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' contexts until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:    # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    # once a pattern matches, upload_date becomes YYYYMMDD;
                    # the remaining patterns then fail and are ignored
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (best effort: failures downgrade to warnings)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # language preference: explicit option > English > first listed
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Comma-separated list of query-string-encoded stream descriptors
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':        video_id.decode('utf-8'),
                'url':        video_real_url.decode('utf-8'),
                'uploader':    video_uploader.decode('utf-8'),
                'upload_date':    upload_date,
                'title':    video_title,
                'ext':        video_extension.decode('utf-8'),
                'format':    (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':    video_description,
                'player_url':    player_url,
                'subtitles':    video_subtitles
            })
        return results
d77c3dfd
FV
458
459
460class MetacafeIE(InfoExtractor):
461 """Information Extractor for metacafe.com."""
462
463 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
464 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
465 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
d77c3dfd
FV
466 IE_NAME = u'metacafe'
467
58ca755f 468 def __init__(self, downloader=None):
d77c3dfd 469 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
470
471 def report_disclaimer(self):
472 """Report disclaimer retrieval."""
473 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
474
475 def report_age_confirmation(self):
476 """Report attempt to confirm age."""
477 self._downloader.to_screen(u'[metacafe] Confirming age')
478
479 def report_download_webpage(self, video_id):
480 """Report webpage download."""
481 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
482
483 def report_extraction(self, video_id):
484 """Report information extraction."""
485 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
486
487 def _real_initialize(self):
488 # Retrieve disclaimer
489 request = urllib2.Request(self._DISCLAIMER)
490 try:
491 self.report_disclaimer()
492 disclaimer = urllib2.urlopen(request).read()
493 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
494 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
495 return
496
497 # Confirm age
498 disclaimer_form = {
499 'filters': '0',
500 'submit': "Continue - I'm over 18",
501 }
502 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
503 try:
504 self.report_age_confirmation()
505 disclaimer = urllib2.urlopen(request).read()
506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
507 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
508 return
509
510 def _real_extract(self, url):
511 # Extract id and simplified title from URL
512 mobj = re.match(self._VALID_URL, url)
513 if mobj is None:
514 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
515 return
516
517 video_id = mobj.group(1)
518
519 # Check if video comes from YouTube
520 mobj2 = re.match(r'^yt-(.*)$', video_id)
521 if mobj2 is not None:
58ca755f 522 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
d77c3dfd
FV
523 return
524
d77c3dfd
FV
525 # Retrieve video webpage to extract further information
526 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
527 try:
528 self.report_download_webpage(video_id)
529 webpage = urllib2.urlopen(request).read()
530 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
531 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
532 return
533
534 # Extract URL, uploader and title from webpage
535 self.report_extraction(video_id)
536 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
537 if mobj is not None:
538 mediaURL = urllib.unquote(mobj.group(1))
539 video_extension = mediaURL[-3:]
540
541 # Extract gdaKey if available
542 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
543 if mobj is None:
544 video_url = mediaURL
545 else:
546 gdaKey = mobj.group(1)
547 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
548 else:
549 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
550 if mobj is None:
551 self._downloader.trouble(u'ERROR: unable to extract media URL')
552 return
553 vardict = parse_qs(mobj.group(1))
554 if 'mediaData' not in vardict:
555 self._downloader.trouble(u'ERROR: unable to extract media URL')
556 return
557 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
558 if mobj is None:
559 self._downloader.trouble(u'ERROR: unable to extract media URL')
560 return
561 mediaURL = mobj.group(1).replace('\\/', '/')
562 video_extension = mediaURL[-3:]
563 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
564
565 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
566 if mobj is None:
567 self._downloader.trouble(u'ERROR: unable to extract title')
568 return
569 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
570
571 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
572 if mobj is None:
573 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
574 return
575 video_uploader = mobj.group(1)
576
58ca755f
FV
577 return [{
578 'id': video_id.decode('utf-8'),
579 'url': video_url.decode('utf-8'),
580 'uploader': video_uploader.decode('utf-8'),
581 'upload_date': u'NA',
582 'title': video_title,
58ca755f
FV
583 'ext': video_extension.decode('utf-8'),
584 'format': u'NA',
585 'player_url': None,
586 }]
d77c3dfd
FV
587
588
589class DailymotionIE(InfoExtractor):
590 """Information Extractor for Dailymotion"""
591
592 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
593 IE_NAME = u'dailymotion'
594
595 def __init__(self, downloader=None):
596 InfoExtractor.__init__(self, downloader)
597
598 def report_download_webpage(self, video_id):
599 """Report webpage download."""
600 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
601
602 def report_extraction(self, video_id):
603 """Report information extraction."""
604 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
605
606 def _real_extract(self, url):
607 # Extract id and simplified title from URL
608 mobj = re.match(self._VALID_URL, url)
609 if mobj is None:
610 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
611 return
612
d77c3dfd
FV
613 video_id = mobj.group(1)
614
615 video_extension = 'flv'
616
617 # Retrieve video webpage to extract further information
618 request = urllib2.Request(url)
619 request.add_header('Cookie', 'family_filter=off')
620 try:
621 self.report_download_webpage(video_id)
622 webpage = urllib2.urlopen(request).read()
623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
624 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
625 return
626
627 # Extract URL, uploader and title from webpage
628 self.report_extraction(video_id)
629 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
630 if mobj is None:
631 self._downloader.trouble(u'ERROR: unable to extract media URL')
632 return
633 sequence = urllib.unquote(mobj.group(1))
634 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
635 if mobj is None:
636 self._downloader.trouble(u'ERROR: unable to extract media URL')
637 return
638 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
639
640 # if needed add http://www.dailymotion.com/ if relative URL
641
642 video_url = mediaURL
643
644 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
645 if mobj is None:
646 self._downloader.trouble(u'ERROR: unable to extract title')
647 return
648 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
d77c3dfd
FV
649
650 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
651 if mobj is None:
652 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
653 return
654 video_uploader = mobj.group(1)
655
58ca755f
FV
656 return [{
657 'id': video_id.decode('utf-8'),
658 'url': video_url.decode('utf-8'),
659 'uploader': video_uploader.decode('utf-8'),
660 'upload_date': u'NA',
661 'title': video_title,
58ca755f
FV
662 'ext': video_extension.decode('utf-8'),
663 'format': u'NA',
664 'player_url': None,
665 }]
d77c3dfd
FV
666
667
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Prefers the mp4 download_url embedded in the page; falls back to the
    escaped flv videoUrl.  Thumbnail extraction requires a second request
    and is only done when 'forcethumbnail' is set.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and description; return a one-item list."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download; fall back to the escaped flv URL
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \xNN escaping used in the page's inline javascript
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (needs an extra search-page request)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:    # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':        video_id.decode('utf-8'),
            'url':        video_url.decode('utf-8'),
            'uploader':    u'NA',
            'upload_date':    u'NA',
            'title':    video_title,
            'ext':        video_extension.decode('utf-8'),
            'format':    u'NA',
            'player_url':    None,
        }]
d77c3dfd
FV
761
762
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    The media URL is taken from the page's video_src link tag; title and
    uploader come from a single <title> regex.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader; return a one-item list."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # group(2) of the same title match is the uploader name
        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':        video_id.decode('utf-8'),
            'url':        video_url.decode('utf-8'),
            'uploader':    video_uploader,
            'upload_date':    u'NA',
            'title':    video_title,
            'ext':        video_extension.decode('utf-8'),
            'format':    u'NA',
            'player_url':    None,
        }]
d77c3dfd
FV
828
829
830class YahooIE(InfoExtractor):
831 """Information extractor for video.yahoo.com."""
832
833 # _VALID_URL matches all Yahoo! Video URLs
834 # _VPAGE_URL matches only the extractable '/watch/' URLs
835 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
836 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
837 IE_NAME = u'video.yahoo'
838
839 def __init__(self, downloader=None):
840 InfoExtractor.__init__(self, downloader)
841
842 def report_download_webpage(self, video_id):
843 """Report webpage download."""
844 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
845
846 def report_extraction(self, video_id):
847 """Report information extraction."""
848 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
849
850 def _real_extract(self, url, new_video=True):
851 # Extract ID from URL
852 mobj = re.match(self._VALID_URL, url)
853 if mobj is None:
854 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
855 return
856
d77c3dfd
FV
857 video_id = mobj.group(2)
858 video_extension = 'flv'
859
860 # Rewrite valid but non-extractable URLs as
861 # extractable English language /watch/ URLs
862 if re.match(self._VPAGE_URL, url) is None:
863 request = urllib2.Request(url)
864 try:
865 webpage = urllib2.urlopen(request).read()
866 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
867 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
868 return
869
870 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
871 if mobj is None:
872 self._downloader.trouble(u'ERROR: Unable to extract id field')
873 return
874 yahoo_id = mobj.group(1)
875
876 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
877 if mobj is None:
878 self._downloader.trouble(u'ERROR: Unable to extract vid field')
879 return
880 yahoo_vid = mobj.group(1)
881
882 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
883 return self._real_extract(url, new_video=False)
884
885 # Retrieve video webpage to extract further information
886 request = urllib2.Request(url)
887 try:
888 self.report_download_webpage(video_id)
889 webpage = urllib2.urlopen(request).read()
890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
892 return
893
894 # Extract uploader and title from webpage
895 self.report_extraction(video_id)
896 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
897 if mobj is None:
898 self._downloader.trouble(u'ERROR: unable to extract video title')
899 return
900 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
901
902 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
903 if mobj is None:
904 self._downloader.trouble(u'ERROR: unable to extract video uploader')
905 return
906 video_uploader = mobj.group(1).decode('utf-8')
907
908 # Extract video thumbnail
909 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
910 if mobj is None:
911 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
912 return
913 video_thumbnail = mobj.group(1).decode('utf-8')
914
915 # Extract video description
916 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
917 if mobj is None:
918 self._downloader.trouble(u'ERROR: unable to extract video description')
919 return
920 video_description = mobj.group(1).decode('utf-8')
921 if not video_description:
922 video_description = 'No description available.'
923
924 # Extract video height and width
925 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
926 if mobj is None:
927 self._downloader.trouble(u'ERROR: unable to extract video height')
928 return
929 yv_video_height = mobj.group(1)
930
931 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
932 if mobj is None:
933 self._downloader.trouble(u'ERROR: unable to extract video width')
934 return
935 yv_video_width = mobj.group(1)
936
937 # Retrieve video playlist to extract media URL
938 # I'm not completely sure what all these options are, but we
939 # seem to need most of them, otherwise the server sends a 401.
940 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
941 yv_bitrate = '700' # according to Wikipedia this is hard-coded
942 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
943 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
944 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
945 try:
946 self.report_download_webpage(video_id)
947 webpage = urllib2.urlopen(request).read()
948 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
949 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
950 return
951
952 # Extract media URL from playlist XML
953 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
954 if mobj is None:
955 self._downloader.trouble(u'ERROR: Unable to extract media URL')
956 return
957 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
9e6dd238 958 video_url = unescapeHTML(video_url)
d77c3dfd 959
58ca755f
FV
960 return [{
961 'id': video_id.decode('utf-8'),
962 'url': video_url,
963 'uploader': video_uploader,
964 'upload_date': u'NA',
965 'title': video_title,
58ca755f
FV
966 'ext': video_extension.decode('utf-8'),
967 'thumbnail': video_thumbnail.decode('utf-8'),
968 'description': video_description,
969 'thumbnail': video_thumbnail,
970 'player_url': None,
971 }]
d77c3dfd
FV
972
973
974class VimeoIE(InfoExtractor):
975 """Information extractor for vimeo.com."""
976
977 # _VALID_URL matches Vimeo URLs
978 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
979 IE_NAME = u'vimeo'
980
981 def __init__(self, downloader=None):
982 InfoExtractor.__init__(self, downloader)
983
984 def report_download_webpage(self, video_id):
985 """Report webpage download."""
986 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
987
988 def report_extraction(self, video_id):
989 """Report information extraction."""
990 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
991
992 def _real_extract(self, url, new_video=True):
993 # Extract ID from URL
994 mobj = re.match(self._VALID_URL, url)
995 if mobj is None:
996 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
997 return
998
d77c3dfd
FV
999 video_id = mobj.group(1)
1000
1001 # Retrieve video webpage to extract further information
1002 request = urllib2.Request(url, None, std_headers)
1003 try:
1004 self.report_download_webpage(video_id)
1005 webpage = urllib2.urlopen(request).read()
1006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1007 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1008 return
1009
1010 # Now we begin extracting as much information as we can from what we
1011 # retrieved. First we extract the information common to all extractors,
1012 # and latter we extract those that are Vimeo specific.
1013 self.report_extraction(video_id)
1014
1015 # Extract the config JSON
1016 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1017 try:
1018 config = json.loads(config)
1019 except:
1020 self._downloader.trouble(u'ERROR: unable to extract info section')
1021 return
1022
1023 # Extract title
1024 video_title = config["video"]["title"]
d77c3dfd
FV
1025
1026 # Extract uploader
1027 video_uploader = config["video"]["owner"]["name"]
1028
1029 # Extract video thumbnail
1030 video_thumbnail = config["video"]["thumbnail"]
1031
1032 # Extract video description
9beb5af8
FV
1033 video_description = get_element_by_id("description", webpage.decode('utf8'))
1034 if video_description: video_description = clean_html(video_description)
9e6dd238 1035 else: video_description = ''
d77c3dfd
FV
1036
1037 # Extract upload date
1038 video_upload_date = u'NA'
1039 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1040 if mobj is not None:
1041 video_upload_date = mobj.group(1)
1042
1043 # Vimeo specific: extract request signature and timestamp
1044 sig = config['request']['signature']
1045 timestamp = config['request']['timestamp']
1046
1047 # Vimeo specific: extract video codec and quality information
1048 # TODO bind to format param
1049 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1050 for codec in codecs:
1051 if codec[0] in config["video"]["files"]:
1052 video_codec = codec[0]
1053 video_extension = codec[1]
1054 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1055 else: quality = 'sd'
1056 break
1057 else:
1058 self._downloader.trouble(u'ERROR: no known codec found')
1059 return
1060
1061 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1062 %(video_id, sig, timestamp, quality, video_codec.upper())
1063
58ca755f
FV
1064 return [{
1065 'id': video_id,
1066 'url': video_url,
1067 'uploader': video_uploader,
1068 'upload_date': video_upload_date,
1069 'title': video_title,
58ca755f
FV
1070 'ext': video_extension,
1071 'thumbnail': video_thumbnail,
1072 'description': video_description,
1073 'player_url': None,
1074 }]
d77c3dfd
FV
1075
1076
1077class GenericIE(InfoExtractor):
1078 """Generic last-resort information extractor."""
1079
1080 _VALID_URL = r'.*'
1081 IE_NAME = u'generic'
1082
1083 def __init__(self, downloader=None):
1084 InfoExtractor.__init__(self, downloader)
1085
1086 def report_download_webpage(self, video_id):
1087 """Report webpage download."""
1088 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1089 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1090
1091 def report_extraction(self, video_id):
1092 """Report information extraction."""
1093 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1094
1095 def report_following_redirect(self, new_url):
1096 """Report information extraction."""
1097 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1098
1099 def _test_redirect(self, url):
1100 """Check if it is a redirect, like url shorteners, in case restart chain."""
1101 class HeadRequest(urllib2.Request):
1102 def get_method(self):
1103 return "HEAD"
1104
1105 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1106 """
1107 Subclass the HTTPRedirectHandler to make it use our
1108 HeadRequest also on the redirected URL
1109 """
1110 def redirect_request(self, req, fp, code, msg, headers, newurl):
1111 if code in (301, 302, 303, 307):
303692b5
FV
1112 newurl = newurl.replace(' ', '%20')
1113 newheaders = dict((k,v) for k,v in req.headers.items()
1114 if k.lower() not in ("content-length", "content-type"))
1115 return HeadRequest(newurl,
1116 headers=newheaders,
1117 origin_req_host=req.get_origin_req_host(),
1118 unverifiable=True)
d77c3dfd 1119 else:
303692b5
FV
1120 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1121
d77c3dfd
FV
1122 class HTTPMethodFallback(urllib2.BaseHandler):
1123 """
1124 Fallback to GET if HEAD is not allowed (405 HTTP error)
1125 """
1126 def http_error_405(self, req, fp, code, msg, headers):
1127 fp.read()
1128 fp.close()
1129
1130 newheaders = dict((k,v) for k,v in req.headers.items()
303692b5 1131 if k.lower() not in ("content-length", "content-type"))
d77c3dfd 1132 return self.parent.open(urllib2.Request(req.get_full_url(),
303692b5
FV
1133 headers=newheaders,
1134 origin_req_host=req.get_origin_req_host(),
1135 unverifiable=True))
d77c3dfd
FV
1136
1137 # Build our opener
1138 opener = urllib2.OpenerDirector()
1139 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
303692b5
FV
1140 HTTPMethodFallback, HEADRedirectHandler,
1141 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
d77c3dfd
FV
1142 opener.add_handler(handler())
1143
1144 response = opener.open(HeadRequest(url))
1145 new_url = response.geturl()
1146
1147 if url == new_url: return False
1148
1149 self.report_following_redirect(new_url)
1150 self._downloader.download([new_url])
1151 return True
1152
1153 def _real_extract(self, url):
1154 if self._test_redirect(url): return
d77c3dfd
FV
1155
1156 video_id = url.split('/')[-1]
1157 request = urllib2.Request(url)
1158 try:
1159 self.report_download_webpage(video_id)
1160 webpage = urllib2.urlopen(request).read()
1161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1163 return
1164 except ValueError, err:
1165 # since this is the last-resort InfoExtractor, if
1166 # this error is thrown, it'll be thrown here
1167 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1168 return
1169
1170 self.report_extraction(video_id)
1171 # Start with something easy: JW Player in SWFObject
1172 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1173 if mobj is None:
1174 # Broaden the search a little bit
1175 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1176 if mobj is None:
1177 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1178 return
1179
1180 # It's possible that one of the regexes
1181 # matched, but returned an empty group:
1182 if mobj.group(1) is None:
1183 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1184 return
1185
1186 video_url = urllib.unquote(mobj.group(1))
1187 video_id = os.path.basename(video_url)
1188
1189 # here's a fun little line of code for you:
1190 video_extension = os.path.splitext(video_id)[1][1:]
1191 video_id = os.path.splitext(video_id)[0]
1192
1193 # it's tempting to parse this further, but you would
1194 # have to take into account all the variations like
1195 # Video Title - Site Name
1196 # Site Name | Video Title
1197 # Video Title - Tagline | Site Name
1198 # and so on and so forth; it's just not practical
1199 mobj = re.search(r'<title>(.*)</title>', webpage)
1200 if mobj is None:
1201 self._downloader.trouble(u'ERROR: unable to extract title')
1202 return
1203 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
1204
1205 # video uploader is domain name
1206 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1207 if mobj is None:
1208 self._downloader.trouble(u'ERROR: unable to extract title')
1209 return
1210 video_uploader = mobj.group(1).decode('utf-8')
1211
58ca755f
FV
1212 return [{
1213 'id': video_id.decode('utf-8'),
1214 'url': video_url.decode('utf-8'),
1215 'uploader': video_uploader,
1216 'upload_date': u'NA',
1217 'title': video_title,
58ca755f
FV
1218 'ext': video_extension.decode('utf-8'),
1219 'format': u'NA',
1220 'player_url': None,
1221 }]
d77c3dfd
FV
1222
1223
1224class YoutubeSearchIE(InfoExtractor):
1225 """Information Extractor for YouTube search queries."""
1226 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1227 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
d77c3dfd
FV
1228 _max_youtube_results = 1000
1229 IE_NAME = u'youtube:search'
1230
58ca755f 1231 def __init__(self, downloader=None):
d77c3dfd 1232 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1233
1234 def report_download_page(self, query, pagenum):
d4e16d3e 1235 """Report attempt to download search page with given number."""
d77c3dfd
FV
1236 query = query.decode(preferredencoding())
1237 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1238
d77c3dfd
FV
1239 def _real_extract(self, query):
1240 mobj = re.match(self._VALID_URL, query)
1241 if mobj is None:
1242 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1243 return
1244
1245 prefix, query = query.split(':')
1246 prefix = prefix[8:]
1247 query = query.encode('utf-8')
1248 if prefix == '':
1249 self._download_n_results(query, 1)
1250 return
1251 elif prefix == 'all':
1252 self._download_n_results(query, self._max_youtube_results)
1253 return
1254 else:
1255 try:
1256 n = long(prefix)
1257 if n <= 0:
1258 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1259 return
1260 elif n > self._max_youtube_results:
1261 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1262 n = self._max_youtube_results
1263 self._download_n_results(query, n)
1264 return
1265 except ValueError: # parsing prefix as integer fails
1266 self._download_n_results(query, 1)
1267 return
1268
1269 def _download_n_results(self, query, n):
1270 """Downloads a specified number of results for a query"""
1271
1272 video_ids = []
1273 pagenum = 0
1274 limit = n
1275
1276 while (50 * pagenum) < limit:
1277 self.report_download_page(query, pagenum+1)
1278 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1279 request = urllib2.Request(result_url)
1280 try:
1281 data = urllib2.urlopen(request).read()
1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1284 return
1285 api_response = json.loads(data)['data']
1286
1287 new_ids = list(video['id'] for video in api_response['items'])
1288 video_ids += new_ids
1289
1290 limit = min(n, api_response['totalItems'])
1291 pagenum += 1
1292
1293 if len(video_ids) > n:
1294 video_ids = video_ids[:n]
1295 for id in video_ids:
58ca755f 1296 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
d77c3dfd
FV
1297 return
1298
1299
1300class GoogleSearchIE(InfoExtractor):
1301 """Information Extractor for Google Video search queries."""
1302 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1303 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1304 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1305 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
d77c3dfd
FV
1306 _max_google_results = 1000
1307 IE_NAME = u'video.google:search'
1308
58ca755f 1309 def __init__(self, downloader=None):
d77c3dfd 1310 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1311
1312 def report_download_page(self, query, pagenum):
1313 """Report attempt to download playlist page with given number."""
1314 query = query.decode(preferredencoding())
1315 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1316
d77c3dfd
FV
1317 def _real_extract(self, query):
1318 mobj = re.match(self._VALID_URL, query)
1319 if mobj is None:
1320 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1321 return
1322
1323 prefix, query = query.split(':')
1324 prefix = prefix[8:]
1325 query = query.encode('utf-8')
1326 if prefix == '':
1327 self._download_n_results(query, 1)
1328 return
1329 elif prefix == 'all':
1330 self._download_n_results(query, self._max_google_results)
1331 return
1332 else:
1333 try:
1334 n = long(prefix)
1335 if n <= 0:
1336 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1337 return
1338 elif n > self._max_google_results:
1339 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1340 n = self._max_google_results
1341 self._download_n_results(query, n)
1342 return
1343 except ValueError: # parsing prefix as integer fails
1344 self._download_n_results(query, 1)
1345 return
1346
1347 def _download_n_results(self, query, n):
1348 """Downloads a specified number of results for a query"""
1349
1350 video_ids = []
1351 pagenum = 0
1352
1353 while True:
1354 self.report_download_page(query, pagenum)
1355 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1356 request = urllib2.Request(result_url)
1357 try:
1358 page = urllib2.urlopen(request).read()
1359 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1360 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1361 return
1362
1363 # Extract video identifiers
1364 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1365 video_id = mobj.group(1)
1366 if video_id not in video_ids:
1367 video_ids.append(video_id)
1368 if len(video_ids) == n:
1369 # Specified n videos reached
1370 for id in video_ids:
58ca755f 1371 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
d77c3dfd
FV
1372 return
1373
1374 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1375 for id in video_ids:
58ca755f 1376 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
d77c3dfd
FV
1377 return
1378
1379 pagenum = pagenum + 1
1380
1381
1382class YahooSearchIE(InfoExtractor):
1383 """Information Extractor for Yahoo! Video search queries."""
1384 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1385 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1386 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1387 _MORE_PAGES_INDICATOR = r'\s*Next'
d77c3dfd
FV
1388 _max_yahoo_results = 1000
1389 IE_NAME = u'video.yahoo:search'
1390
58ca755f 1391 def __init__(self, downloader=None):
d77c3dfd 1392 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1393
1394 def report_download_page(self, query, pagenum):
1395 """Report attempt to download playlist page with given number."""
1396 query = query.decode(preferredencoding())
1397 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1398
d77c3dfd
FV
1399 def _real_extract(self, query):
1400 mobj = re.match(self._VALID_URL, query)
1401 if mobj is None:
1402 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1403 return
1404
1405 prefix, query = query.split(':')
1406 prefix = prefix[8:]
1407 query = query.encode('utf-8')
1408 if prefix == '':
1409 self._download_n_results(query, 1)
1410 return
1411 elif prefix == 'all':
1412 self._download_n_results(query, self._max_yahoo_results)
1413 return
1414 else:
1415 try:
1416 n = long(prefix)
1417 if n <= 0:
1418 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1419 return
1420 elif n > self._max_yahoo_results:
1421 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1422 n = self._max_yahoo_results
1423 self._download_n_results(query, n)
1424 return
1425 except ValueError: # parsing prefix as integer fails
1426 self._download_n_results(query, 1)
1427 return
1428
1429 def _download_n_results(self, query, n):
1430 """Downloads a specified number of results for a query"""
1431
1432 video_ids = []
1433 already_seen = set()
1434 pagenum = 1
1435
1436 while True:
1437 self.report_download_page(query, pagenum)
1438 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1439 request = urllib2.Request(result_url)
1440 try:
1441 page = urllib2.urlopen(request).read()
1442 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1443 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1444 return
1445
1446 # Extract video identifiers
1447 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1448 video_id = mobj.group(1)
1449 if video_id not in already_seen:
1450 video_ids.append(video_id)
1451 already_seen.add(video_id)
1452 if len(video_ids) == n:
1453 # Specified n videos reached
1454 for id in video_ids:
58ca755f 1455 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1456 return
1457
1458 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1459 for id in video_ids:
58ca755f 1460 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1461 return
1462
1463 pagenum = pagenum + 1
1464
1465
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches playlist/course/artist/user-playlist URLs; group(1) is the
    # query-string key ('p', 'a' or 'list'), group(2) the playlist id,
    # group(3) an optional single video id embedded in the URL.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
    # Presence of this class in the page means there is a "next" pager link
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL names one specific video, download just it
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within a page)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the pager offers no further page
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Apply the user's playliststart/playlistend window
        # (playliststart is 1-based; playlistend == -1 means "to the end")
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1537
1538
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 results, hence the paging below
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within a page)
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the user's playliststart/playlistend window
        # (playliststart is 1-based; playlistend == -1 means "to the end")
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1620
1621
1622class DepositFilesIE(InfoExtractor):
1623 """Information extractor for depositfiles.com"""
1624
1625 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1626 IE_NAME = u'DepositFiles'
1627
1628 def __init__(self, downloader=None):
1629 InfoExtractor.__init__(self, downloader)
1630
1631 def report_download_webpage(self, file_id):
1632 """Report webpage download."""
1633 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1634
1635 def report_extraction(self, file_id):
1636 """Report information extraction."""
1637 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1638
1639 def _real_extract(self, url):
d77c3dfd
FV
1640 file_id = url.split('/')[-1]
1641 # Rebuild url in english locale
1642 url = 'http://depositfiles.com/en/files/' + file_id
1643
1644 # Retrieve file webpage with 'Free download' button pressed
1645 free_download_indication = { 'gateway_result' : '1' }
1646 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1647 try:
1648 self.report_download_webpage(file_id)
1649 webpage = urllib2.urlopen(request).read()
1650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1651 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1652 return
1653
1654 # Search for the real file URL
1655 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1656 if (mobj is None) or (mobj.group(1) is None):
1657 # Try to figure out reason of the error.
1658 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1659 if (mobj is not None) and (mobj.group(1) is not None):
1660 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1661 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1662 else:
1663 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1664 return
1665
1666 file_url = mobj.group(1)
1667 file_extension = os.path.splitext(file_url)[1][1:]
1668
1669 # Search for file title
1670 mobj = re.search(r'<b title="(.*?)">', webpage)
1671 if mobj is None:
1672 self._downloader.trouble(u'ERROR: unable to extract title')
1673 return
1674 file_title = mobj.group(1).decode('utf-8')
1675
58ca755f
FV
1676 return [{
1677 'id': file_id.decode('utf-8'),
1678 'url': file_url.decode('utf-8'),
1679 'uploader': u'NA',
1680 'upload_date': u'NA',
1681 'title': file_title,
58ca755f
FV
1682 'ext': file_extension.decode('utf-8'),
1683 'format': u'NA',
1684 'player_url': None,
1685 }]
d77c3dfd
FV
1686
1687
1688class FacebookIE(InfoExtractor):
1689 """Information Extractor for Facebook"""
1690
1691 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1692 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1693 _NETRC_MACHINE = 'facebook'
1694 _available_formats = ['video', 'highqual', 'lowqual']
1695 _video_extensions = {
1696 'video': 'mp4',
1697 'highqual': 'mp4',
1698 'lowqual': 'mp4',
1699 }
1700 IE_NAME = u'facebook'
1701
1702 def __init__(self, downloader=None):
1703 InfoExtractor.__init__(self, downloader)
1704
1705 def _reporter(self, message):
1706 """Add header and report message."""
1707 self._downloader.to_screen(u'[facebook] %s' % message)
1708
1709 def report_login(self):
1710 """Report attempt to log in."""
1711 self._reporter(u'Logging in')
1712
1713 def report_video_webpage_download(self, video_id):
1714 """Report attempt to download video webpage."""
1715 self._reporter(u'%s: Downloading video webpage' % video_id)
1716
1717 def report_information_extraction(self, video_id):
1718 """Report attempt to extract video information."""
1719 self._reporter(u'%s: Extracting video information' % video_id)
1720
1721 def _parse_page(self, video_webpage):
1722 """Extract video information from page"""
1723 # General data
1724 data = {'title': r'\("video_title", "(.*?)"\)',
1725 'description': r'<div class="datawrap">(.*?)</div>',
1726 'owner': r'\("video_owner_name", "(.*?)"\)',
1727 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1728 }
1729 video_info = {}
1730 for piece in data.keys():
1731 mobj = re.search(data[piece], video_webpage)
1732 if mobj is not None:
1733 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1734
1735 # Video urls
1736 video_urls = {}
1737 for fmt in self._available_formats:
1738 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1739 if mobj is not None:
1740 # URL is in a Javascript segment inside an escaped Unicode format within
1741 # the generally utf-8 page
1742 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1743 video_info['video_urls'] = video_urls
1744
1745 return video_info
1746
1747 def _real_initialize(self):
1748 if self._downloader is None:
1749 return
1750
1751 useremail = None
1752 password = None
1753 downloader_params = self._downloader.params
1754
1755 # Attempt to use provided username and password or .netrc data
1756 if downloader_params.get('username', None) is not None:
1757 useremail = downloader_params['username']
1758 password = downloader_params['password']
1759 elif downloader_params.get('usenetrc', False):
1760 try:
1761 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1762 if info is not None:
1763 useremail = info[0]
1764 password = info[2]
1765 else:
1766 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1767 except (IOError, netrc.NetrcParseError), err:
1768 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1769 return
1770
1771 if useremail is None:
1772 return
1773
1774 # Log in
1775 login_form = {
1776 'email': useremail,
1777 'pass': password,
1778 'login': 'Log+In'
1779 }
1780 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1781 try:
1782 self.report_login()
1783 login_results = urllib2.urlopen(request).read()
1784 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1785 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1786 return
1787 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1788 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1789 return
1790
1791 def _real_extract(self, url):
1792 mobj = re.match(self._VALID_URL, url)
1793 if mobj is None:
1794 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1795 return
1796 video_id = mobj.group('ID')
1797
1798 # Get video webpage
1799 self.report_video_webpage_download(video_id)
1800 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1801 try:
1802 page = urllib2.urlopen(request)
1803 video_webpage = page.read()
1804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1806 return
1807
1808 # Start extracting information
1809 self.report_information_extraction(video_id)
1810
1811 # Extract information
1812 video_info = self._parse_page(video_webpage)
1813
1814 # uploader
1815 if 'owner' not in video_info:
1816 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1817 return
1818 video_uploader = video_info['owner']
1819
1820 # title
1821 if 'title' not in video_info:
1822 self._downloader.trouble(u'ERROR: unable to extract video title')
1823 return
1824 video_title = video_info['title']
1825 video_title = video_title.decode('utf-8')
d77c3dfd
FV
1826
1827 # thumbnail image
1828 if 'thumbnail' not in video_info:
1829 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1830 video_thumbnail = ''
1831 else:
1832 video_thumbnail = video_info['thumbnail']
1833
1834 # upload date
1835 upload_date = u'NA'
1836 if 'upload_date' in video_info:
1837 upload_time = video_info['upload_date']
1838 timetuple = email.utils.parsedate_tz(upload_time)
1839 if timetuple is not None:
1840 try:
1841 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1842 except:
1843 pass
1844
1845 # description
1846 video_description = video_info.get('description', 'No description available.')
1847
1848 url_map = video_info['video_urls']
1849 if len(url_map.keys()) > 0:
1850 # Decide which formats to download
1851 req_format = self._downloader.params.get('format', None)
1852 format_limit = self._downloader.params.get('format_limit', None)
1853
1854 if format_limit is not None and format_limit in self._available_formats:
1855 format_list = self._available_formats[self._available_formats.index(format_limit):]
1856 else:
1857 format_list = self._available_formats
1858 existing_formats = [x for x in format_list if x in url_map]
1859 if len(existing_formats) == 0:
1860 self._downloader.trouble(u'ERROR: no known formats available for video')
1861 return
1862 if req_format is None:
1863 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1864 elif req_format == 'worst':
1865 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1866 elif req_format == '-1':
1867 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1868 else:
1869 # Specific format
1870 if req_format not in url_map:
1871 self._downloader.trouble(u'ERROR: requested format not available')
1872 return
1873 video_url_list = [(req_format, url_map[req_format])] # Specific format
1874
58ca755f 1875 results = []
d77c3dfd 1876 for format_param, video_real_url in video_url_list:
d77c3dfd
FV
1877 # Extension
1878 video_extension = self._video_extensions.get(format_param, 'mp4')
1879
58ca755f
FV
1880 results.append({
1881 'id': video_id.decode('utf-8'),
1882 'url': video_real_url.decode('utf-8'),
1883 'uploader': video_uploader.decode('utf-8'),
1884 'upload_date': upload_date,
1885 'title': video_title,
58ca755f
FV
1886 'ext': video_extension.decode('utf-8'),
1887 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1888 'thumbnail': video_thumbnail.decode('utf-8'),
1889 'description': video_description.decode('utf-8'),
1890 'player_url': None,
1891 })
1892 return results
d77c3dfd
FV
1893
1894class BlipTVIE(InfoExtractor):
1895 """Information extractor for blip.tv"""
1896
1897 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1898 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1899 IE_NAME = u'blip.tv'
1900
1901 def report_extraction(self, file_id):
1902 """Report information extraction."""
1903 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1904
1905 def report_direct_download(self, title):
1906 """Report information extraction."""
1907 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1908
1909 def _real_extract(self, url):
1910 mobj = re.match(self._VALID_URL, url)
1911 if mobj is None:
1912 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1913 return
1914
1915 if '?' in url:
1916 cchar = '&'
1917 else:
1918 cchar = '?'
1919 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1920 request = urllib2.Request(json_url)
1921 self.report_extraction(mobj.group(1))
1922 info = None
1923 try:
1924 urlh = urllib2.urlopen(request)
1925 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1926 basename = url.split('/')[-1]
1927 title,ext = os.path.splitext(basename)
1928 title = title.decode('UTF-8')
1929 ext = ext.replace('.', '')
1930 self.report_direct_download(title)
1931 info = {
1932 'id': title,
1933 'url': url,
1934 'title': title,
d77c3dfd
FV
1935 'ext': ext,
1936 'urlhandle': urlh
1937 }
1938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1940 return
1941 if info is None: # Regular URL
1942 try:
1943 json_code = urlh.read()
1944 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1945 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1946 return
1947
1948 try:
1949 json_data = json.loads(json_code)
1950 if 'Post' in json_data:
1951 data = json_data['Post']
1952 else:
1953 data = json_data
3fe294e4 1954
d77c3dfd
FV
1955 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1956 video_url = data['media']['url']
1957 umobj = re.match(self._URL_EXT, video_url)
1958 if umobj is None:
1959 raise ValueError('Can not determine filename extension')
1960 ext = umobj.group(1)
3fe294e4 1961
d77c3dfd
FV
1962 info = {
1963 'id': data['item_id'],
1964 'url': video_url,
1965 'uploader': data['display_name'],
1966 'upload_date': upload_date,
1967 'title': data['title'],
d77c3dfd
FV
1968 'ext': ext,
1969 'format': data['media']['mimeType'],
1970 'thumbnail': data['thumbnailUrl'],
1971 'description': data['description'],
1972 'player_url': data['embedUrl']
1973 }
1974 except (ValueError,KeyError), err:
1975 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1976 return
1977
58ca755f 1978 return [info]
d77c3dfd
FV
1979
1980
1981class MyVideoIE(InfoExtractor):
1982 """Information Extractor for myvideo.de."""
1983
1984 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1985 IE_NAME = u'myvideo'
1986
1987 def __init__(self, downloader=None):
1988 InfoExtractor.__init__(self, downloader)
1989
1990 def report_download_webpage(self, video_id):
1991 """Report webpage download."""
1992 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1993
1994 def report_extraction(self, video_id):
1995 """Report information extraction."""
1996 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1997
1998 def _real_extract(self,url):
1999 mobj = re.match(self._VALID_URL, url)
2000 if mobj is None:
2001 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2002 return
2003
2004 video_id = mobj.group(1)
2005
2006 # Get video webpage
2007 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2008 try:
2009 self.report_download_webpage(video_id)
2010 webpage = urllib2.urlopen(request).read()
2011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2012 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2013 return
2014
2015 self.report_extraction(video_id)
2016 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2017 webpage)
2018 if mobj is None:
2019 self._downloader.trouble(u'ERROR: unable to extract media URL')
2020 return
2021 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2022
2023 mobj = re.search('<title>([^<]+)</title>', webpage)
2024 if mobj is None:
2025 self._downloader.trouble(u'ERROR: unable to extract title')
2026 return
2027
2028 video_title = mobj.group(1)
d77c3dfd 2029
58ca755f
FV
2030 return [{
2031 'id': video_id,
2032 'url': video_url,
2033 'uploader': u'NA',
2034 'upload_date': u'NA',
2035 'title': video_title,
58ca755f
FV
2036 'ext': u'flv',
2037 'format': u'NA',
2038 'player_url': None,
2039 }]
d77c3dfd
FV
2040
2041class ComedyCentralIE(InfoExtractor):
2042 """Information extractor for The Daily Show and Colbert Report """
2043
2044 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2045 IE_NAME = u'comedycentral'
2046
2047 def report_extraction(self, episode_id):
2048 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3fe294e4 2049
d77c3dfd
FV
2050 def report_config_download(self, episode_id):
2051 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2052
2053 def report_index_download(self, episode_id):
2054 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2055
2056 def report_player_url(self, episode_id):
2057 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2058
2059 def _real_extract(self, url):
2060 mobj = re.match(self._VALID_URL, url)
2061 if mobj is None:
2062 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2063 return
2064
2065 if mobj.group('shortname'):
2066 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2067 url = u'http://www.thedailyshow.com/full-episodes/'
2068 else:
2069 url = u'http://www.colbertnation.com/full-episodes/'
2070 mobj = re.match(self._VALID_URL, url)
2071 assert mobj is not None
2072
2073 dlNewest = not mobj.group('episode')
2074 if dlNewest:
2075 epTitle = mobj.group('showname')
2076 else:
2077 epTitle = mobj.group('episode')
2078
2079 req = urllib2.Request(url)
2080 self.report_extraction(epTitle)
2081 try:
2082 htmlHandle = urllib2.urlopen(req)
2083 html = htmlHandle.read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2086 return
2087 if dlNewest:
2088 url = htmlHandle.geturl()
2089 mobj = re.match(self._VALID_URL, url)
2090 if mobj is None:
2091 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2092 return
2093 if mobj.group('episode') == '':
2094 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2095 return
2096 epTitle = mobj.group('episode')
2097
2098 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2099 if len(mMovieParams) == 0:
2100 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2101 return
2102
2103 playerUrl_raw = mMovieParams[0][0]
2104 self.report_player_url(epTitle)
2105 try:
2106 urlHandle = urllib2.urlopen(playerUrl_raw)
2107 playerUrl = urlHandle.geturl()
2108 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2109 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2110 return
2111
2112 uri = mMovieParams[0][1]
2113 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2114 self.report_index_download(epTitle)
2115 try:
2116 indexXml = urllib2.urlopen(indexUrl).read()
2117 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2119 return
2120
58ca755f
FV
2121 results = []
2122
d77c3dfd
FV
2123 idoc = xml.etree.ElementTree.fromstring(indexXml)
2124 itemEls = idoc.findall('.//item')
2125 for itemEl in itemEls:
2126 mediaId = itemEl.findall('./guid')[0].text
2127 shortMediaId = mediaId.split(':')[-1]
2128 showId = mediaId.split(':')[-2].replace('.com', '')
2129 officialTitle = itemEl.findall('./title')[0].text
2130 officialDate = itemEl.findall('./pubDate')[0].text
2131
2132 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2133 urllib.urlencode({'uri': mediaId}))
2134 configReq = urllib2.Request(configUrl)
2135 self.report_config_download(epTitle)
2136 try:
2137 configXml = urllib2.urlopen(configReq).read()
2138 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2139 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2140 return
2141
2142 cdoc = xml.etree.ElementTree.fromstring(configXml)
2143 turls = []
2144 for rendition in cdoc.findall('.//rendition'):
2145 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2146 turls.append(finfo)
2147
2148 if len(turls) == 0:
2149 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2150 continue
2151
2152 # For now, just pick the highest bitrate
2153 format,video_url = turls[-1]
2154
d77c3dfd
FV
2155 effTitle = showId + u'-' + epTitle
2156 info = {
2157 'id': shortMediaId,
2158 'url': video_url,
2159 'uploader': showId,
2160 'upload_date': officialDate,
2161 'title': effTitle,
d77c3dfd
FV
2162 'ext': 'mp4',
2163 'format': format,
2164 'thumbnail': None,
2165 'description': officialTitle,
2166 'player_url': playerUrl
2167 }
2168
58ca755f
FV
2169 results.append(info)
2170
2171 return results
d77c3dfd
FV
2172
2173
2174class EscapistIE(InfoExtractor):
2175 """Information extractor for The Escapist """
2176
2177 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2178 IE_NAME = u'escapist'
2179
2180 def report_extraction(self, showName):
2181 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2182
2183 def report_config_download(self, showName):
2184 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2185
2186 def _real_extract(self, url):
d77c3dfd
FV
2187 mobj = re.match(self._VALID_URL, url)
2188 if mobj is None:
2189 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2190 return
2191 showName = mobj.group('showname')
2192 videoId = mobj.group('episode')
2193
2194 self.report_extraction(showName)
2195 try:
3210735c
PH
2196 webPage = urllib2.urlopen(url)
2197 webPageBytes = webPage.read()
2198 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2199 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
d77c3dfd
FV
2200 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2201 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2202 return
2203
2204 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
9e6dd238 2205 description = unescapeHTML(descMatch.group(1))
d77c3dfd 2206 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
9e6dd238 2207 imgUrl = unescapeHTML(imgMatch.group(1))
d77c3dfd 2208 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
9e6dd238 2209 playerUrl = unescapeHTML(playerUrlMatch.group(1))
d77c3dfd
FV
2210 configUrlMatch = re.search('config=(.*)$', playerUrl)
2211 configUrl = urllib2.unquote(configUrlMatch.group(1))
2212
2213 self.report_config_download(showName)
2214 try:
2215 configJSON = urllib2.urlopen(configUrl).read()
2216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2218 return
2219
2220 # Technically, it's JavaScript, not JSON
2221 configJSON = configJSON.replace("'", '"')
2222
2223 try:
2224 config = json.loads(configJSON)
2225 except (ValueError,), err:
2226 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2227 return
2228
2229 playlist = config['playlist']
2230 videoUrl = playlist[1]['url']
2231
d77c3dfd
FV
2232 info = {
2233 'id': videoId,
2234 'url': videoUrl,
2235 'uploader': showName,
2236 'upload_date': None,
2237 'title': showName,
d77c3dfd
FV
2238 'ext': 'flv',
2239 'format': 'flv',
2240 'thumbnail': imgUrl,
2241 'description': description,
2242 'player_url': playerUrl,
2243 }
2244
58ca755f 2245 return [info]
d77c3dfd
FV
2246
2247
2248class CollegeHumorIE(InfoExtractor):
2249 """Information extractor for collegehumor.com"""
2250
2251 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2252 IE_NAME = u'collegehumor'
2253
2254 def report_webpage(self, video_id):
2255 """Report information extraction."""
2256 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2257
2258 def report_extraction(self, video_id):
2259 """Report information extraction."""
2260 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2261
2262 def _real_extract(self, url):
d77c3dfd
FV
2263 mobj = re.match(self._VALID_URL, url)
2264 if mobj is None:
2265 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2266 return
2267 video_id = mobj.group('videoid')
2268
2269 self.report_webpage(video_id)
2270 request = urllib2.Request(url)
2271 try:
2272 webpage = urllib2.urlopen(request).read()
2273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2274 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2275 return
2276
2277 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2278 if m is None:
2279 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2280 return
2281 internal_video_id = m.group('internalvideoid')
2282
2283 info = {
2284 'id': video_id,
2285 'internal_id': internal_video_id,
2286 }
2287
2288 self.report_extraction(video_id)
2289 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2290 try:
2291 metaXml = urllib2.urlopen(xmlUrl).read()
2292 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2293 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2294 return
2295
2296 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2297 try:
2298 videoNode = mdoc.findall('./video')[0]
2299 info['description'] = videoNode.findall('./description')[0].text
2300 info['title'] = videoNode.findall('./caption')[0].text
d77c3dfd
FV
2301 info['url'] = videoNode.findall('./file')[0].text
2302 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2303 info['ext'] = info['url'].rpartition('.')[2]
2304 info['format'] = info['ext']
2305 except IndexError:
2306 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2307 return
2308
58ca755f 2309 return [info]
d77c3dfd
FV
2310
2311
2312class XVideosIE(InfoExtractor):
2313 """Information extractor for xvideos.com"""
2314
2315 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2316 IE_NAME = u'xvideos'
2317
2318 def report_webpage(self, video_id):
2319 """Report information extraction."""
2320 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2321
2322 def report_extraction(self, video_id):
2323 """Report information extraction."""
2324 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2325
2326 def _real_extract(self, url):
d77c3dfd
FV
2327 mobj = re.match(self._VALID_URL, url)
2328 if mobj is None:
2329 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2330 return
2331 video_id = mobj.group(1).decode('utf-8')
2332
2333 self.report_webpage(video_id)
2334
2335 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2336 try:
2337 webpage = urllib2.urlopen(request).read()
2338 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2339 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2340 return
2341
2342 self.report_extraction(video_id)
2343
2344
2345 # Extract video URL
2346 mobj = re.search(r'flv_url=(.+?)&', webpage)
2347 if mobj is None:
2348 self._downloader.trouble(u'ERROR: unable to extract video url')
2349 return
2350 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2351
2352
2353 # Extract title
2354 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2355 if mobj is None:
2356 self._downloader.trouble(u'ERROR: unable to extract video title')
2357 return
2358 video_title = mobj.group(1).decode('utf-8')
2359
2360
2361 # Extract video thumbnail
2362 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2363 if mobj is None:
2364 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2365 return
2366 video_thumbnail = mobj.group(1).decode('utf-8')
2367
d77c3dfd
FV
2368 info = {
2369 'id': video_id,
2370 'url': video_url,
2371 'uploader': None,
2372 'upload_date': None,
2373 'title': video_title,
d77c3dfd
FV
2374 'ext': 'flv',
2375 'format': 'flv',
2376 'thumbnail': video_thumbnail,
2377 'description': None,
2378 'player_url': None,
2379 }
2380
58ca755f 2381 return [info]
d77c3dfd
FV
2382
2383
2384class SoundcloudIE(InfoExtractor):
2385 """Information extractor for soundcloud.com
2386 To access the media, the uid of the song and a stream token
2387 must be extracted from the page source and the script must make
2388 a request to media.soundcloud.com/crossdomain.xml. Then
2389 the media can be grabbed by requesting from an url composed
2390 of the stream token and uid
2391 """
2392
2393 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2394 IE_NAME = u'soundcloud'
2395
2396 def __init__(self, downloader=None):
2397 InfoExtractor.__init__(self, downloader)
2398
2399 def report_webpage(self, video_id):
2400 """Report information extraction."""
2401 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2402
2403 def report_extraction(self, video_id):
2404 """Report information extraction."""
2405 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2406
2407 def _real_extract(self, url):
d77c3dfd
FV
2408 mobj = re.match(self._VALID_URL, url)
2409 if mobj is None:
2410 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2411 return
2412
2413 # extract uploader (which is in the url)
2414 uploader = mobj.group(1).decode('utf-8')
2415 # extract simple title (uploader + slug of song title)
2416 slug_title = mobj.group(2).decode('utf-8')
2c288bda 2417 simple_title = uploader + u'-' + slug_title
d77c3dfd
FV
2418
2419 self.report_webpage('%s/%s' % (uploader, slug_title))
2420
2421 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2422 try:
2423 webpage = urllib2.urlopen(request).read()
2424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2425 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2426 return
2427
2428 self.report_extraction('%s/%s' % (uploader, slug_title))
2429
2430 # extract uid and stream token that soundcloud hands out for access
2431 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2432 if mobj:
2433 video_id = mobj.group(1)
2434 stream_token = mobj.group(2)
2435
2436 # extract unsimplified title
2437 mobj = re.search('"title":"(.*?)",', webpage)
2438 if mobj:
2c288bda
FV
2439 title = mobj.group(1).decode('utf-8')
2440 else:
2441 title = simple_title
d77c3dfd
FV
2442
2443 # construct media url (with uid/token)
2444 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2445 mediaURL = mediaURL % (video_id, stream_token)
2446
2447 # description
2448 description = u'No description available'
2449 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2450 if mobj:
2451 description = mobj.group(1)
2452
2453 # upload date
2454 upload_date = None
2455 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2456 if mobj:
2457 try:
2458 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2459 except Exception, e:
6ab92c8b 2460 self._downloader.to_stderr(str(e))
d77c3dfd
FV
2461
2462 # for soundcloud, a request to a cross domain is required for cookies
2463 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2464
58ca755f
FV
2465 return [{
2466 'id': video_id.decode('utf-8'),
2467 'url': mediaURL,
2468 'uploader': uploader.decode('utf-8'),
2469 'upload_date': upload_date,
2c288bda 2470 'title': title,
58ca755f
FV
2471 'ext': u'mp3',
2472 'format': u'NA',
2473 'player_url': None,
2474 'description': description.decode('utf-8')
2475 }]
d77c3dfd
FV
2476
2477
2478class InfoQIE(InfoExtractor):
2479 """Information extractor for infoq.com"""
2480
2481 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2482 IE_NAME = u'infoq'
2483
2484 def report_webpage(self, video_id):
2485 """Report information extraction."""
2486 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2487
2488 def report_extraction(self, video_id):
2489 """Report information extraction."""
2490 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2491
2492 def _real_extract(self, url):
d77c3dfd
FV
2493 mobj = re.match(self._VALID_URL, url)
2494 if mobj is None:
2495 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2496 return
2497
2498 self.report_webpage(url)
2499
2500 request = urllib2.Request(url)
2501 try:
2502 webpage = urllib2.urlopen(request).read()
2503 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2504 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2505 return
2506
2507 self.report_extraction(url)
2508
2509
2510 # Extract video URL
2511 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2512 if mobj is None:
2513 self._downloader.trouble(u'ERROR: unable to extract video url')
2514 return
2515 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2516
2517
2518 # Extract title
2519 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2520 if mobj is None:
2521 self._downloader.trouble(u'ERROR: unable to extract video title')
2522 return
2523 video_title = mobj.group(1).decode('utf-8')
2524
2525 # Extract description
2526 video_description = u'No description available.'
2527 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2528 if mobj is not None:
2529 video_description = mobj.group(1).decode('utf-8')
2530
2531 video_filename = video_url.split('/')[-1]
2532 video_id, extension = video_filename.split('.')
2533
d77c3dfd
FV
2534 info = {
2535 'id': video_id,
2536 'url': video_url,
2537 'uploader': None,
2538 'upload_date': None,
2539 'title': video_title,
d77c3dfd
FV
2540 'ext': extension,
2541 'format': extension, # Extension is always(?) mp4, but seems to be flv
2542 'thumbnail': None,
2543 'description': video_description,
2544 'player_url': None,
2545 }
2546
58ca755f 2547 return [info]
d77c3dfd
FV
2548
2549class MixcloudIE(InfoExtractor):
2550 """Information extractor for www.mixcloud.com"""
2551 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2552 IE_NAME = u'mixcloud'
2553
	def __init__(self, downloader=None):
		# Nothing extractor-specific to set up; defer to the base class.
		InfoExtractor.__init__(self, downloader)
2556
	def report_download_json(self, file_id):
		"""Report that the cloudcast JSON metadata is being downloaded."""
		# NOTE: file_id is accepted but not included in the message.
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2560
	def report_extraction(self, file_id):
		"""Report that information extraction has started for file_id."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2564
2565 def get_urls(self, jsonData, fmt, bitrate='best'):
2566 """Get urls from 'audio_formats' section in json"""
2567 file_url = None
2568 try:
2569 bitrate_list = jsonData[fmt]
2570 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2571 bitrate = max(bitrate_list) # select highest
2572
2573 url_list = jsonData[fmt][bitrate]
2574 except TypeError: # we have no bitrate info.
2575 url_list = jsonData[fmt]
d77c3dfd
FV
2576 return url_list
2577
2578 def check_urls(self, url_list):
2579 """Returns 1st active url from list"""
2580 for url in url_list:
2581 try:
2582 urllib2.urlopen(url)
2583 return url
2584 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2585 url = None
2586
2587 return None
2588
2589 def _print_formats(self, formats):
2590 print 'Available formats:'
2591 for fmt in formats.keys():
2592 for b in formats[fmt]:
2593 try:
2594 ext = formats[fmt][b][0]
2595 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2596 except TypeError: # we have no bitrate info
2597 ext = formats[fmt][0]
2598 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2599 break
2600
2601 def _real_extract(self, url):
2602 mobj = re.match(self._VALID_URL, url)
2603 if mobj is None:
2604 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2605 return
2606 # extract uploader & filename from url
2607 uploader = mobj.group(1).decode('utf-8')
2608 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2609
2610 # construct API request
2611 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2612 # retrieve .json file with links to files
2613 request = urllib2.Request(file_url)
2614 try:
2615 self.report_download_json(file_url)
2616 jsonData = urllib2.urlopen(request).read()
2617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2619 return
2620
2621 # parse JSON
2622 json_data = json.loads(jsonData)
2623 player_url = json_data['player_swf_url']
2624 formats = dict(json_data['audio_formats'])
2625
2626 req_format = self._downloader.params.get('format', None)
2627 bitrate = None
2628
2629 if self._downloader.params.get('listformats', None):
2630 self._print_formats(formats)
2631 return
2632
2633 if req_format is None or req_format == 'best':
2634 for format_param in formats.keys():
2635 url_list = self.get_urls(formats, format_param)
2636 # check urls
2637 file_url = self.check_urls(url_list)
2638 if file_url is not None:
2639 break # got it!
2640 else:
2641 if req_format not in formats.keys():
2642 self._downloader.trouble(u'ERROR: format is not available')
2643 return
2644
2645 url_list = self.get_urls(formats, req_format)
2646 file_url = self.check_urls(url_list)
2647 format_param = req_format
2648
58ca755f
FV
2649 return [{
2650 'id': file_id.decode('utf-8'),
2651 'url': file_url.decode('utf-8'),
2652 'uploader': uploader.decode('utf-8'),
2653 'upload_date': u'NA',
2654 'title': json_data['name'],
58ca755f
FV
2655 'ext': file_url.split('.')[-1].decode('utf-8'),
2656 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2657 'thumbnail': json_data['thumbnail_url'],
2658 'description': json_data['description'],
2659 'player_url': player_url.decode('utf-8'),
2660 }]
d77c3dfd
FV
2661
2662class StanfordOpenClassroomIE(InfoExtractor):
2663 """Information extractor for Stanford's Open ClassRoom"""
2664
2665 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2666 IE_NAME = u'stanfordoc'
2667
2668 def report_download_webpage(self, objid):
2669 """Report information extraction."""
2670 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2671
2672 def report_extraction(self, video_id):
2673 """Report information extraction."""
2674 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2675
2676 def _real_extract(self, url):
2677 mobj = re.match(self._VALID_URL, url)
2678 if mobj is None:
2679 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2680 return
2681
2682 if mobj.group('course') and mobj.group('video'): # A specific video
2683 course = mobj.group('course')
2684 video = mobj.group('video')
2685 info = {
2c288bda 2686 'id': course + '_' + video,
d77c3dfd 2687 }
3fe294e4 2688
d77c3dfd
FV
2689 self.report_extraction(info['id'])
2690 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2691 xmlUrl = baseUrl + video + '.xml'
2692 try:
2693 metaXml = urllib2.urlopen(xmlUrl).read()
2694 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2695 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2696 return
2697 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2698 try:
2699 info['title'] = mdoc.findall('./title')[0].text
2700 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2701 except IndexError:
2702 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2703 return
d77c3dfd
FV
2704 info['ext'] = info['url'].rpartition('.')[2]
2705 info['format'] = info['ext']
58ca755f 2706 return [info]
d77c3dfd 2707 elif mobj.group('course'): # A course page
d77c3dfd
FV
2708 course = mobj.group('course')
2709 info = {
2c288bda 2710 'id': course,
d77c3dfd
FV
2711 'type': 'playlist',
2712 }
2713
2714 self.report_download_webpage(info['id'])
2715 try:
2716 coursepage = urllib2.urlopen(url).read()
2717 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2718 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2719 return
2720
2721 m = re.search('<h1>([^<]+)</h1>', coursepage)
2722 if m:
2723 info['title'] = unescapeHTML(m.group(1))
2724 else:
2725 info['title'] = info['id']
d77c3dfd
FV
2726
2727 m = re.search('<description>([^<]+)</description>', coursepage)
2728 if m:
2729 info['description'] = unescapeHTML(m.group(1))
2730
2731 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2732 info['list'] = [
2733 {
2734 'type': 'reference',
2735 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2736 }
2737 for vpage in links]
58ca755f 2738 results = []
d77c3dfd
FV
2739 for entry in info['list']:
2740 assert entry['type'] == 'reference'
58ca755f
FV
2741 results += self.extract(entry['url'])
2742 return results
2743
d77c3dfd 2744 else: # Root page
d77c3dfd
FV
2745 info = {
2746 'id': 'Stanford OpenClassroom',
2747 'type': 'playlist',
2748 }
2749
2750 self.report_download_webpage(info['id'])
2751 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2752 try:
2753 rootpage = urllib2.urlopen(rootURL).read()
2754 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2755 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2756 return
2757
2758 info['title'] = info['id']
d77c3dfd
FV
2759
2760 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2761 info['list'] = [
2762 {
2763 'type': 'reference',
2764 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2765 }
2766 for cpage in links]
2767
58ca755f 2768 results = []
d77c3dfd
FV
2769 for entry in info['list']:
2770 assert entry['type'] == 'reference'
58ca755f
FV
2771 results += self.extract(entry['url'])
2772 return results
d77c3dfd
FV
2773
2774class MTVIE(InfoExtractor):
2775 """Information extractor for MTV.com"""
2776
2777 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2778 IE_NAME = u'mtv'
2779
2780 def report_webpage(self, video_id):
2781 """Report information extraction."""
2782 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2783
2784 def report_extraction(self, video_id):
2785 """Report information extraction."""
2786 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2787
2788 def _real_extract(self, url):
2789 mobj = re.match(self._VALID_URL, url)
2790 if mobj is None:
2791 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2792 return
2793 if not mobj.group('proto'):
2794 url = 'http://' + url
2795 video_id = mobj.group('videoid')
2796 self.report_webpage(video_id)
2797
2798 request = urllib2.Request(url)
2799 try:
2800 webpage = urllib2.urlopen(request).read()
2801 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2802 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2803 return
2804
2805 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2806 if mobj is None:
2807 self._downloader.trouble(u'ERROR: unable to extract song name')
2808 return
2809 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2810 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2811 if mobj is None:
2812 self._downloader.trouble(u'ERROR: unable to extract performer')
2813 return
2814 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2815 video_title = performer + ' - ' + song_name
2816
2817 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2818 if mobj is None:
2819 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2820 return
2821 mtvn_uri = mobj.group(1)
2822
2823 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2824 if mobj is None:
2825 self._downloader.trouble(u'ERROR: unable to extract content id')
2826 return
2827 content_id = mobj.group(1)
2828
2829 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2830 self.report_extraction(video_id)
2831 request = urllib2.Request(videogen_url)
2832 try:
2833 metadataXml = urllib2.urlopen(request).read()
2834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2835 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2836 return
2837
2838 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2839 renditions = mdoc.findall('.//rendition')
2840
2841 # For now, always pick the highest quality.
2842 rendition = renditions[-1]
2843
2844 try:
2845 _,_,ext = rendition.attrib['type'].partition('/')
2846 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2847 video_url = rendition.find('./src').text
2848 except KeyError:
2849 self._downloader.trouble('Invalid rendition field.')
2850 return
2851
d77c3dfd
FV
2852 info = {
2853 'id': video_id,
2854 'url': video_url,
2855 'uploader': performer,
2856 'title': video_title,
d77c3dfd
FV
2857 'ext': ext,
2858 'format': format,
2859 }
2860
58ca755f 2861 return [info]