]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
Added options to set download buffer size and disable automatic buffer resizing.
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import datetime
5import HTMLParser
6import httplib
7import netrc
8import os
9import re
10import socket
11import time
12import urllib
13import urllib2
14import email.utils
921a1455
FV
15import xml.etree.ElementTree
16from urlparse import parse_qs
d77c3dfd
FV
17
18try:
19 import cStringIO as StringIO
20except ImportError:
21 import StringIO
22
d11d05d0 23from utils import *
d77c3dfd
FV
24
25
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor takes a URL and produces, for each video that
    URL refers to, a dictionary of metadata which is then handed to the
    FileDownloader (which usually downloads the video, among other possible
    outcomes).  Every dictionary must carry these fields:

    id:		Video identifier.
    url:		Final video URL.
    uploader:	Nickname of the video uploader.
    title:		Literal title.
    ext:		Video filename extension.
    format:		Video format.
    player_url:	SWF Player URL (may be None).

    The following fields are optional; they mainly exist so youtube-dl can
    serve as the backend for a video search frontend (such as the one in
    youtube2mp3) and are only consulted by the forced-printing helpers:

    thumbnail:	Full URL to a video thumbnail image.
    description:	One-line video description.

    Subclasses should override _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; they should usually also be added to the
    list of extractors.
    """

    # Shared defaults; __init__ re-sets _ready per instance.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
94
95class YoutubeIE(InfoExtractor):
96 """Information extractor for youtube.com."""
97
98 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
99 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
100 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
101 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
102 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
103 _NETRC_MACHINE = 'youtube'
104 # Listed in order of quality
3fe294e4
FV
105 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
106 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
d77c3dfd
FV
107 _video_extensions = {
108 '13': '3gp',
109 '17': 'mp4',
110 '18': 'mp4',
111 '22': 'mp4',
112 '37': 'mp4',
113 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
114 '43': 'webm',
115 '44': 'webm',
116 '45': 'webm',
3fe294e4 117 '46': 'webm',
d77c3dfd
FV
118 }
119 _video_dimensions = {
120 '5': '240x400',
121 '6': '???',
122 '13': '???',
123 '17': '144x176',
124 '18': '360x640',
125 '22': '720x1280',
126 '34': '360x640',
127 '35': '480x854',
128 '37': '1080x1920',
129 '38': '3072x4096',
130 '43': '360x640',
131 '44': '480x854',
132 '45': '720x1280',
3fe294e4 133 '46': '1080x1920',
d77c3dfd
FV
134 }
135 IE_NAME = u'youtube'
136
137 def report_lang(self):
138 """Report attempt to set language."""
139 self._downloader.to_screen(u'[youtube] Setting language')
140
141 def report_login(self):
142 """Report attempt to log in."""
143 self._downloader.to_screen(u'[youtube] Logging in')
144
145 def report_age_confirmation(self):
146 """Report attempt to confirm age."""
147 self._downloader.to_screen(u'[youtube] Confirming age')
148
149 def report_video_webpage_download(self, video_id):
150 """Report attempt to download video webpage."""
151 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
152
153 def report_video_info_webpage_download(self, video_id):
154 """Report attempt to download video info webpage."""
155 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
156
157 def report_video_subtitles_download(self, video_id):
158 """Report attempt to download video info webpage."""
159 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
160
161 def report_information_extraction(self, video_id):
162 """Report attempt to extract video information."""
163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
164
165 def report_unavailable_format(self, video_id, format):
166 """Report extracted video URL."""
167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
168
169 def report_rtmp_download(self):
170 """Indicate the download will use the RTMP protocol."""
171 self._downloader.to_screen(u'[youtube] RTMP download detected')
172
173 def _closed_captions_xml_to_srt(self, xml_string):
174 srt = ''
175 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
176 # TODO parse xml instead of regex
177 for n, (start, dur_tag, dur, caption) in enumerate(texts):
178 if not dur: dur = '4'
179 start = float(start)
180 end = start + float(dur)
181 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
182 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
9e6dd238 183 caption = unescapeHTML(caption)
6ab92c8b 184 caption = unescapeHTML(caption) # double cycle, intentional
54041793 185 srt += str(n+1) + '\n'
d77c3dfd
FV
186 srt += start + ' --> ' + end + '\n'
187 srt += caption + '\n\n'
188 return srt
189
190 def _print_formats(self, formats):
191 print 'Available formats:'
192 for x in formats:
193 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
194
195 def _real_initialize(self):
196 if self._downloader is None:
197 return
198
199 username = None
200 password = None
201 downloader_params = self._downloader.params
202
203 # Attempt to use provided username and password or .netrc data
204 if downloader_params.get('username', None) is not None:
205 username = downloader_params['username']
206 password = downloader_params['password']
207 elif downloader_params.get('usenetrc', False):
208 try:
209 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
210 if info is not None:
211 username = info[0]
212 password = info[2]
213 else:
214 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
215 except (IOError, netrc.NetrcParseError), err:
216 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
217 return
218
219 # Set language
220 request = urllib2.Request(self._LANG_URL)
221 try:
222 self.report_lang()
223 urllib2.urlopen(request).read()
224 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
225 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
226 return
227
228 # No authentication to be performed
229 if username is None:
230 return
231
232 # Log in
233 login_form = {
234 'current_form': 'loginForm',
235 'next': '/',
236 'action_login': 'Log In',
237 'username': username,
238 'password': password,
239 }
240 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
241 try:
242 self.report_login()
243 login_results = urllib2.urlopen(request).read()
244 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
245 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
246 return
247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
248 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
249 return
250
251 # Confirm age
252 age_form = {
253 'next_url': '/',
254 'action_confirm': 'Confirm',
255 }
256 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
257 try:
258 self.report_age_confirmation()
259 age_results = urllib2.urlopen(request).read()
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
262 return
263
264 def _real_extract(self, url):
265 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
266 mobj = re.search(self._NEXT_URL_RE, url)
267 if mobj:
268 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
269
270 # Extract video id from URL
271 mobj = re.match(self._VALID_URL, url)
272 if mobj is None:
273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
274 return
275 video_id = mobj.group(2)
276
277 # Get video webpage
278 self.report_video_webpage_download(video_id)
279 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
280 try:
281 video_webpage = urllib2.urlopen(request).read()
282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
283 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
284 return
285
286 # Attempt to extract SWF player URL
287 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
288 if mobj is not None:
289 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
290 else:
291 player_url = None
292
293 # Get video info
294 self.report_video_info_webpage_download(video_id)
295 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
296 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
297 % (video_id, el_type))
298 request = urllib2.Request(video_info_url)
299 try:
300 video_info_webpage = urllib2.urlopen(request).read()
301 video_info = parse_qs(video_info_webpage)
302 if 'token' in video_info:
303 break
304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
305 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
306 return
307 if 'token' not in video_info:
308 if 'reason' in video_info:
309 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
310 else:
311 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
312 return
313
7df97fb5
FV
314 # Check for "rental" videos
315 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
316 self._downloader.trouble(u'ERROR: "rental" videos not supported')
317 return
318
d77c3dfd
FV
319 # Start extracting information
320 self.report_information_extraction(video_id)
321
322 # uploader
323 if 'author' not in video_info:
324 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
325 return
326 video_uploader = urllib.unquote_plus(video_info['author'][0])
327
328 # title
329 if 'title' not in video_info:
330 self._downloader.trouble(u'ERROR: unable to extract video title')
331 return
332 video_title = urllib.unquote_plus(video_info['title'][0])
333 video_title = video_title.decode('utf-8')
d77c3dfd
FV
334
335 # thumbnail image
336 if 'thumbnail_url' not in video_info:
337 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
338 video_thumbnail = ''
339 else: # don't panic if we can't find it
340 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
341
342 # upload date
343 upload_date = u'NA'
344 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
345 if mobj is not None:
346 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
347 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
348 for expression in format_expressions:
349 try:
350 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
351 except:
352 pass
353
354 # description
9beb5af8
FV
355 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
356 if video_description: video_description = clean_html(video_description)
9e6dd238 357 else: video_description = ''
d77c3dfd
FV
358
359 # closed captions
360 video_subtitles = None
361 if self._downloader.params.get('writesubtitles', False):
d77c3dfd 362 try:
0b8c922d
FV
363 self.report_video_subtitles_download(video_id)
364 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
365 try:
366 srt_list = urllib2.urlopen(request).read()
367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
368 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
d77c3dfd 369 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
0b8c922d
FV
370 if not srt_lang_list:
371 raise Trouble(u'WARNING: video has no closed captions')
372 if self._downloader.params.get('subtitleslang', False):
373 srt_lang = self._downloader.params.get('subtitleslang')
374 elif 'en' in srt_lang_list:
375 srt_lang = 'en'
d77c3dfd 376 else:
0b8c922d
FV
377 srt_lang = srt_lang_list[0]
378 if not srt_lang in srt_lang_list:
379 raise Trouble(u'WARNING: no closed captions found in the specified language')
380 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
381 try:
382 srt_xml = urllib2.urlopen(request).read()
383 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
384 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
385 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
386 except Trouble as trouble:
387 self._downloader.trouble(trouble[0])
d77c3dfd
FV
388
389 # token
390 video_token = urllib.unquote_plus(video_info['token'][0])
391
392 # Decide which formats to download
393 req_format = self._downloader.params.get('format', None)
394
395 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
396 self.report_rtmp_download()
397 video_url_list = [(None, video_info['conn'][0])]
398 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
399 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
400 url_data = [parse_qs(uds) for uds in url_data_strs]
401 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
402 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
403
404 format_limit = self._downloader.params.get('format_limit', None)
405 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
406 if format_limit is not None and format_limit in available_formats:
407 format_list = available_formats[available_formats.index(format_limit):]
408 else:
409 format_list = available_formats
410 existing_formats = [x for x in format_list if x in url_map]
411 if len(existing_formats) == 0:
412 self._downloader.trouble(u'ERROR: no known formats available for video')
413 return
414 if self._downloader.params.get('listformats', None):
415 self._print_formats(existing_formats)
416 return
417 if req_format is None or req_format == 'best':
418 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
419 elif req_format == 'worst':
420 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
421 elif req_format in ('-1', 'all'):
422 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
423 else:
424 # Specific formats. We pick the first in a slash-delimeted sequence.
425 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
426 req_formats = req_format.split('/')
427 video_url_list = None
428 for rf in req_formats:
429 if rf in url_map:
430 video_url_list = [(rf, url_map[rf])]
431 break
432 if video_url_list is None:
433 self._downloader.trouble(u'ERROR: requested format not available')
434 return
435 else:
436 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
437 return
438
58ca755f 439 results = []
d77c3dfd 440 for format_param, video_real_url in video_url_list:
d77c3dfd
FV
441 # Extension
442 video_extension = self._video_extensions.get(format_param, 'flv')
443
58ca755f
FV
444 results.append({
445 'id': video_id.decode('utf-8'),
446 'url': video_real_url.decode('utf-8'),
447 'uploader': video_uploader.decode('utf-8'),
448 'upload_date': upload_date,
449 'title': video_title,
58ca755f
FV
450 'ext': video_extension.decode('utf-8'),
451 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
452 'thumbnail': video_thumbnail.decode('utf-8'),
453 'description': video_description,
454 'player_url': player_url,
455 'subtitles': video_subtitles
456 })
457 return results
d77c3dfd
FV
458
459
460class MetacafeIE(InfoExtractor):
461 """Information Extractor for metacafe.com."""
462
463 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
464 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
465 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
d77c3dfd
FV
466 IE_NAME = u'metacafe'
467
58ca755f 468 def __init__(self, downloader=None):
d77c3dfd 469 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
470
471 def report_disclaimer(self):
472 """Report disclaimer retrieval."""
473 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
474
475 def report_age_confirmation(self):
476 """Report attempt to confirm age."""
477 self._downloader.to_screen(u'[metacafe] Confirming age')
478
479 def report_download_webpage(self, video_id):
480 """Report webpage download."""
481 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
482
483 def report_extraction(self, video_id):
484 """Report information extraction."""
485 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
486
487 def _real_initialize(self):
488 # Retrieve disclaimer
489 request = urllib2.Request(self._DISCLAIMER)
490 try:
491 self.report_disclaimer()
492 disclaimer = urllib2.urlopen(request).read()
493 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
494 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
495 return
496
497 # Confirm age
498 disclaimer_form = {
499 'filters': '0',
500 'submit': "Continue - I'm over 18",
501 }
502 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
503 try:
504 self.report_age_confirmation()
505 disclaimer = urllib2.urlopen(request).read()
506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
507 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
508 return
509
510 def _real_extract(self, url):
511 # Extract id and simplified title from URL
512 mobj = re.match(self._VALID_URL, url)
513 if mobj is None:
514 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
515 return
516
517 video_id = mobj.group(1)
518
519 # Check if video comes from YouTube
520 mobj2 = re.match(r'^yt-(.*)$', video_id)
521 if mobj2 is not None:
58ca755f 522 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
d77c3dfd
FV
523 return
524
d77c3dfd
FV
525 # Retrieve video webpage to extract further information
526 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
527 try:
528 self.report_download_webpage(video_id)
529 webpage = urllib2.urlopen(request).read()
530 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
531 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
532 return
533
534 # Extract URL, uploader and title from webpage
535 self.report_extraction(video_id)
536 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
537 if mobj is not None:
538 mediaURL = urllib.unquote(mobj.group(1))
539 video_extension = mediaURL[-3:]
540
541 # Extract gdaKey if available
542 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
543 if mobj is None:
544 video_url = mediaURL
545 else:
546 gdaKey = mobj.group(1)
547 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
548 else:
549 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
550 if mobj is None:
551 self._downloader.trouble(u'ERROR: unable to extract media URL')
552 return
553 vardict = parse_qs(mobj.group(1))
554 if 'mediaData' not in vardict:
555 self._downloader.trouble(u'ERROR: unable to extract media URL')
556 return
557 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
558 if mobj is None:
559 self._downloader.trouble(u'ERROR: unable to extract media URL')
560 return
561 mediaURL = mobj.group(1).replace('\\/', '/')
562 video_extension = mediaURL[-3:]
563 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
564
565 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
566 if mobj is None:
567 self._downloader.trouble(u'ERROR: unable to extract title')
568 return
569 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
570
571 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
572 if mobj is None:
573 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
574 return
575 video_uploader = mobj.group(1)
576
58ca755f
FV
577 return [{
578 'id': video_id.decode('utf-8'),
579 'url': video_url.decode('utf-8'),
580 'uploader': video_uploader.decode('utf-8'),
581 'upload_date': u'NA',
582 'title': video_title,
58ca755f
FV
583 'ext': video_extension.decode('utf-8'),
584 'format': u'NA',
585 'player_url': None,
586 }]
d77c3dfd
FV
587
588
589class DailymotionIE(InfoExtractor):
590 """Information Extractor for Dailymotion"""
591
592 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
593 IE_NAME = u'dailymotion'
594
595 def __init__(self, downloader=None):
596 InfoExtractor.__init__(self, downloader)
597
598 def report_download_webpage(self, video_id):
599 """Report webpage download."""
600 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
601
602 def report_extraction(self, video_id):
603 """Report information extraction."""
604 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
605
606 def _real_extract(self, url):
607 # Extract id and simplified title from URL
608 mobj = re.match(self._VALID_URL, url)
609 if mobj is None:
610 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
611 return
612
d77c3dfd
FV
613 video_id = mobj.group(1)
614
615 video_extension = 'flv'
616
617 # Retrieve video webpage to extract further information
618 request = urllib2.Request(url)
619 request.add_header('Cookie', 'family_filter=off')
620 try:
621 self.report_download_webpage(video_id)
622 webpage = urllib2.urlopen(request).read()
623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
624 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
625 return
626
627 # Extract URL, uploader and title from webpage
628 self.report_extraction(video_id)
629 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
630 if mobj is None:
631 self._downloader.trouble(u'ERROR: unable to extract media URL')
632 return
633 sequence = urllib.unquote(mobj.group(1))
634 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
635 if mobj is None:
636 self._downloader.trouble(u'ERROR: unable to extract media URL')
637 return
638 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
639
640 # if needed add http://www.dailymotion.com/ if relative URL
641
642 video_url = mediaURL
643
644 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
645 if mobj is None:
646 self._downloader.trouble(u'ERROR: unable to extract title')
647 return
648 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
d77c3dfd
FV
649
650 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
651 if mobj is None:
652 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
653 return
654 video_uploader = mobj.group(1)
655
58ca755f
FV
656 return [{
657 'id': video_id.decode('utf-8'),
658 'url': video_url.decode('utf-8'),
659 'uploader': video_uploader.decode('utf-8'),
660 'upload_date': u'NA',
661 'title': video_title,
58ca755f
FV
662 'ext': video_extension.decode('utf-8'),
663 'format': u'NA',
664 'player_url': None,
665 }]
d77c3dfd
FV
666
667
668class GoogleIE(InfoExtractor):
669 """Information extractor for video.google.com."""
670
671 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
672 IE_NAME = u'video.google'
673
674 def __init__(self, downloader=None):
675 InfoExtractor.__init__(self, downloader)
676
677 def report_download_webpage(self, video_id):
678 """Report webpage download."""
679 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
680
681 def report_extraction(self, video_id):
682 """Report information extraction."""
683 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
684
685 def _real_extract(self, url):
686 # Extract id from URL
687 mobj = re.match(self._VALID_URL, url)
688 if mobj is None:
689 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
690 return
691
d77c3dfd
FV
692 video_id = mobj.group(1)
693
694 video_extension = 'mp4'
695
696 # Retrieve video webpage to extract further information
697 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
698 try:
699 self.report_download_webpage(video_id)
700 webpage = urllib2.urlopen(request).read()
701 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
702 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
703 return
704
705 # Extract URL, uploader, and title from webpage
706 self.report_extraction(video_id)
707 mobj = re.search(r"download_url:'([^']+)'", webpage)
708 if mobj is None:
709 video_extension = 'flv'
710 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
711 if mobj is None:
712 self._downloader.trouble(u'ERROR: unable to extract media URL')
713 return
714 mediaURL = urllib.unquote(mobj.group(1))
715 mediaURL = mediaURL.replace('\\x3d', '\x3d')
716 mediaURL = mediaURL.replace('\\x26', '\x26')
717
718 video_url = mediaURL
719
720 mobj = re.search(r'<title>(.*)</title>', webpage)
721 if mobj is None:
722 self._downloader.trouble(u'ERROR: unable to extract title')
723 return
724 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
725
726 # Extract video description
727 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
728 if mobj is None:
729 self._downloader.trouble(u'ERROR: unable to extract video description')
730 return
731 video_description = mobj.group(1).decode('utf-8')
732 if not video_description:
733 video_description = 'No description available.'
734
735 # Extract video thumbnail
736 if self._downloader.params.get('forcethumbnail', False):
737 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
738 try:
739 webpage = urllib2.urlopen(request).read()
740 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
741 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
742 return
743 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
744 if mobj is None:
745 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
746 return
747 video_thumbnail = mobj.group(1)
748 else: # we need something to pass to process_info
749 video_thumbnail = ''
750
58ca755f
FV
751 return [{
752 'id': video_id.decode('utf-8'),
753 'url': video_url.decode('utf-8'),
754 'uploader': u'NA',
755 'upload_date': u'NA',
756 'title': video_title,
58ca755f
FV
757 'ext': video_extension.decode('utf-8'),
758 'format': u'NA',
759 'player_url': None,
760 }]
d77c3dfd
FV
761
762
763class PhotobucketIE(InfoExtractor):
764 """Information extractor for photobucket.com."""
765
766 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
767 IE_NAME = u'photobucket'
768
769 def __init__(self, downloader=None):
770 InfoExtractor.__init__(self, downloader)
771
772 def report_download_webpage(self, video_id):
773 """Report webpage download."""
774 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
775
776 def report_extraction(self, video_id):
777 """Report information extraction."""
778 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
779
780 def _real_extract(self, url):
781 # Extract id from URL
782 mobj = re.match(self._VALID_URL, url)
783 if mobj is None:
784 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
785 return
786
d77c3dfd
FV
787 video_id = mobj.group(1)
788
789 video_extension = 'flv'
790
791 # Retrieve video webpage to extract further information
792 request = urllib2.Request(url)
793 try:
794 self.report_download_webpage(video_id)
795 webpage = urllib2.urlopen(request).read()
796 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
797 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
798 return
799
800 # Extract URL, uploader, and title from webpage
801 self.report_extraction(video_id)
802 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
803 if mobj is None:
804 self._downloader.trouble(u'ERROR: unable to extract media URL')
805 return
806 mediaURL = urllib.unquote(mobj.group(1))
807
808 video_url = mediaURL
809
810 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
811 if mobj is None:
812 self._downloader.trouble(u'ERROR: unable to extract title')
813 return
814 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
815
816 video_uploader = mobj.group(2).decode('utf-8')
817
58ca755f
FV
818 return [{
819 'id': video_id.decode('utf-8'),
820 'url': video_url.decode('utf-8'),
821 'uploader': video_uploader,
822 'upload_date': u'NA',
823 'title': video_title,
58ca755f
FV
824 'ext': video_extension.decode('utf-8'),
825 'format': u'NA',
826 'player_url': None,
827 }]
d77c3dfd
FV
828
829
830class YahooIE(InfoExtractor):
831 """Information extractor for video.yahoo.com."""
832
833 # _VALID_URL matches all Yahoo! Video URLs
834 # _VPAGE_URL matches only the extractable '/watch/' URLs
835 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
836 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
837 IE_NAME = u'video.yahoo'
838
839 def __init__(self, downloader=None):
840 InfoExtractor.__init__(self, downloader)
841
842 def report_download_webpage(self, video_id):
843 """Report webpage download."""
844 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
845
846 def report_extraction(self, video_id):
847 """Report information extraction."""
848 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
849
850 def _real_extract(self, url, new_video=True):
851 # Extract ID from URL
852 mobj = re.match(self._VALID_URL, url)
853 if mobj is None:
854 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
855 return
856
d77c3dfd
FV
857 video_id = mobj.group(2)
858 video_extension = 'flv'
859
860 # Rewrite valid but non-extractable URLs as
861 # extractable English language /watch/ URLs
862 if re.match(self._VPAGE_URL, url) is None:
863 request = urllib2.Request(url)
864 try:
865 webpage = urllib2.urlopen(request).read()
866 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
867 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
868 return
869
870 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
871 if mobj is None:
872 self._downloader.trouble(u'ERROR: Unable to extract id field')
873 return
874 yahoo_id = mobj.group(1)
875
876 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
877 if mobj is None:
878 self._downloader.trouble(u'ERROR: Unable to extract vid field')
879 return
880 yahoo_vid = mobj.group(1)
881
882 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
883 return self._real_extract(url, new_video=False)
884
885 # Retrieve video webpage to extract further information
886 request = urllib2.Request(url)
887 try:
888 self.report_download_webpage(video_id)
889 webpage = urllib2.urlopen(request).read()
890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
892 return
893
894 # Extract uploader and title from webpage
895 self.report_extraction(video_id)
896 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
897 if mobj is None:
898 self._downloader.trouble(u'ERROR: unable to extract video title')
899 return
900 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
901
902 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
903 if mobj is None:
904 self._downloader.trouble(u'ERROR: unable to extract video uploader')
905 return
906 video_uploader = mobj.group(1).decode('utf-8')
907
908 # Extract video thumbnail
909 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
910 if mobj is None:
911 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
912 return
913 video_thumbnail = mobj.group(1).decode('utf-8')
914
915 # Extract video description
916 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
917 if mobj is None:
918 self._downloader.trouble(u'ERROR: unable to extract video description')
919 return
920 video_description = mobj.group(1).decode('utf-8')
921 if not video_description:
922 video_description = 'No description available.'
923
924 # Extract video height and width
925 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
926 if mobj is None:
927 self._downloader.trouble(u'ERROR: unable to extract video height')
928 return
929 yv_video_height = mobj.group(1)
930
931 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
932 if mobj is None:
933 self._downloader.trouble(u'ERROR: unable to extract video width')
934 return
935 yv_video_width = mobj.group(1)
936
937 # Retrieve video playlist to extract media URL
938 # I'm not completely sure what all these options are, but we
939 # seem to need most of them, otherwise the server sends a 401.
940 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
941 yv_bitrate = '700' # according to Wikipedia this is hard-coded
942 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
943 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
944 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
945 try:
946 self.report_download_webpage(video_id)
947 webpage = urllib2.urlopen(request).read()
948 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
949 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
950 return
951
952 # Extract media URL from playlist XML
953 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
954 if mobj is None:
955 self._downloader.trouble(u'ERROR: Unable to extract media URL')
956 return
957 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
9e6dd238 958 video_url = unescapeHTML(video_url)
d77c3dfd 959
58ca755f
FV
960 return [{
961 'id': video_id.decode('utf-8'),
962 'url': video_url,
963 'uploader': video_uploader,
964 'upload_date': u'NA',
965 'title': video_title,
58ca755f
FV
966 'ext': video_extension.decode('utf-8'),
967 'thumbnail': video_thumbnail.decode('utf-8'),
968 'description': video_description,
969 'thumbnail': video_thumbnail,
970 'player_url': None,
971 }]
d77c3dfd
FV
972
973
974class VimeoIE(InfoExtractor):
975 """Information extractor for vimeo.com."""
976
977 # _VALID_URL matches Vimeo URLs
978 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
979 IE_NAME = u'vimeo'
980
981 def __init__(self, downloader=None):
982 InfoExtractor.__init__(self, downloader)
983
984 def report_download_webpage(self, video_id):
985 """Report webpage download."""
986 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
987
988 def report_extraction(self, video_id):
989 """Report information extraction."""
990 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
991
992 def _real_extract(self, url, new_video=True):
993 # Extract ID from URL
994 mobj = re.match(self._VALID_URL, url)
995 if mobj is None:
996 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
997 return
998
d77c3dfd
FV
999 video_id = mobj.group(1)
1000
1001 # Retrieve video webpage to extract further information
1002 request = urllib2.Request(url, None, std_headers)
1003 try:
1004 self.report_download_webpage(video_id)
1005 webpage = urllib2.urlopen(request).read()
1006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1007 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1008 return
1009
1010 # Now we begin extracting as much information as we can from what we
1011 # retrieved. First we extract the information common to all extractors,
1012 # and latter we extract those that are Vimeo specific.
1013 self.report_extraction(video_id)
1014
1015 # Extract the config JSON
1016 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1017 try:
1018 config = json.loads(config)
1019 except:
1020 self._downloader.trouble(u'ERROR: unable to extract info section')
1021 return
1022
1023 # Extract title
1024 video_title = config["video"]["title"]
d77c3dfd
FV
1025
1026 # Extract uploader
1027 video_uploader = config["video"]["owner"]["name"]
1028
1029 # Extract video thumbnail
1030 video_thumbnail = config["video"]["thumbnail"]
1031
1032 # Extract video description
9beb5af8
FV
1033 video_description = get_element_by_id("description", webpage.decode('utf8'))
1034 if video_description: video_description = clean_html(video_description)
9e6dd238 1035 else: video_description = ''
d77c3dfd
FV
1036
1037 # Extract upload date
1038 video_upload_date = u'NA'
1039 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1040 if mobj is not None:
1041 video_upload_date = mobj.group(1)
1042
1043 # Vimeo specific: extract request signature and timestamp
1044 sig = config['request']['signature']
1045 timestamp = config['request']['timestamp']
1046
1047 # Vimeo specific: extract video codec and quality information
1048 # TODO bind to format param
1049 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1050 for codec in codecs:
1051 if codec[0] in config["video"]["files"]:
1052 video_codec = codec[0]
1053 video_extension = codec[1]
1054 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1055 else: quality = 'sd'
1056 break
1057 else:
1058 self._downloader.trouble(u'ERROR: no known codec found')
1059 return
1060
1061 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1062 %(video_id, sig, timestamp, quality, video_codec.upper())
1063
58ca755f
FV
1064 return [{
1065 'id': video_id,
1066 'url': video_url,
1067 'uploader': video_uploader,
1068 'upload_date': video_upload_date,
1069 'title': video_title,
58ca755f
FV
1070 'ext': video_extension,
1071 'thumbnail': video_thumbnail,
1072 'description': video_description,
1073 'player_url': None,
1074 }]
d77c3dfd
FV
1075
1076
1077class GenericIE(InfoExtractor):
1078 """Generic last-resort information extractor."""
1079
1080 _VALID_URL = r'.*'
1081 IE_NAME = u'generic'
1082
1083 def __init__(self, downloader=None):
1084 InfoExtractor.__init__(self, downloader)
1085
1086 def report_download_webpage(self, video_id):
1087 """Report webpage download."""
1088 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1089 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1090
1091 def report_extraction(self, video_id):
1092 """Report information extraction."""
1093 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1094
1095 def report_following_redirect(self, new_url):
1096 """Report information extraction."""
1097 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1098
1099 def _test_redirect(self, url):
1100 """Check if it is a redirect, like url shorteners, in case restart chain."""
1101 class HeadRequest(urllib2.Request):
1102 def get_method(self):
1103 return "HEAD"
1104
1105 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1106 """
1107 Subclass the HTTPRedirectHandler to make it use our
1108 HeadRequest also on the redirected URL
1109 """
1110 def redirect_request(self, req, fp, code, msg, headers, newurl):
1111 if code in (301, 302, 303, 307):
303692b5
FV
1112 newurl = newurl.replace(' ', '%20')
1113 newheaders = dict((k,v) for k,v in req.headers.items()
1114 if k.lower() not in ("content-length", "content-type"))
1115 return HeadRequest(newurl,
1116 headers=newheaders,
1117 origin_req_host=req.get_origin_req_host(),
1118 unverifiable=True)
d77c3dfd 1119 else:
303692b5
FV
1120 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1121
d77c3dfd
FV
1122 class HTTPMethodFallback(urllib2.BaseHandler):
1123 """
1124 Fallback to GET if HEAD is not allowed (405 HTTP error)
1125 """
1126 def http_error_405(self, req, fp, code, msg, headers):
1127 fp.read()
1128 fp.close()
1129
1130 newheaders = dict((k,v) for k,v in req.headers.items()
303692b5 1131 if k.lower() not in ("content-length", "content-type"))
d77c3dfd 1132 return self.parent.open(urllib2.Request(req.get_full_url(),
303692b5
FV
1133 headers=newheaders,
1134 origin_req_host=req.get_origin_req_host(),
1135 unverifiable=True))
d77c3dfd
FV
1136
1137 # Build our opener
1138 opener = urllib2.OpenerDirector()
1139 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
303692b5
FV
1140 HTTPMethodFallback, HEADRedirectHandler,
1141 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
d77c3dfd
FV
1142 opener.add_handler(handler())
1143
1144 response = opener.open(HeadRequest(url))
1145 new_url = response.geturl()
1146
1147 if url == new_url: return False
1148
1149 self.report_following_redirect(new_url)
1150 self._downloader.download([new_url])
1151 return True
1152
1153 def _real_extract(self, url):
1154 if self._test_redirect(url): return
d77c3dfd
FV
1155
1156 video_id = url.split('/')[-1]
1157 request = urllib2.Request(url)
1158 try:
1159 self.report_download_webpage(video_id)
1160 webpage = urllib2.urlopen(request).read()
1161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1163 return
1164 except ValueError, err:
1165 # since this is the last-resort InfoExtractor, if
1166 # this error is thrown, it'll be thrown here
1167 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1168 return
1169
1170 self.report_extraction(video_id)
1171 # Start with something easy: JW Player in SWFObject
1172 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1173 if mobj is None:
1174 # Broaden the search a little bit
1175 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1176 if mobj is None:
1177 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1178 return
1179
1180 # It's possible that one of the regexes
1181 # matched, but returned an empty group:
1182 if mobj.group(1) is None:
1183 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1184 return
1185
1186 video_url = urllib.unquote(mobj.group(1))
1187 video_id = os.path.basename(video_url)
1188
1189 # here's a fun little line of code for you:
1190 video_extension = os.path.splitext(video_id)[1][1:]
1191 video_id = os.path.splitext(video_id)[0]
1192
1193 # it's tempting to parse this further, but you would
1194 # have to take into account all the variations like
1195 # Video Title - Site Name
1196 # Site Name | Video Title
1197 # Video Title - Tagline | Site Name
1198 # and so on and so forth; it's just not practical
1199 mobj = re.search(r'<title>(.*)</title>', webpage)
1200 if mobj is None:
1201 self._downloader.trouble(u'ERROR: unable to extract title')
1202 return
1203 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
1204
1205 # video uploader is domain name
1206 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1207 if mobj is None:
1208 self._downloader.trouble(u'ERROR: unable to extract title')
1209 return
1210 video_uploader = mobj.group(1).decode('utf-8')
1211
58ca755f
FV
1212 return [{
1213 'id': video_id.decode('utf-8'),
1214 'url': video_url.decode('utf-8'),
1215 'uploader': video_uploader,
1216 'upload_date': u'NA',
1217 'title': video_title,
58ca755f
FV
1218 'ext': video_extension.decode('utf-8'),
1219 'format': u'NA',
1220 'player_url': None,
1221 }]
d77c3dfd
FV
1222
1223
1224class YoutubeSearchIE(InfoExtractor):
1225 """Information Extractor for YouTube search queries."""
1226 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1227 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
d77c3dfd
FV
1228 _max_youtube_results = 1000
1229 IE_NAME = u'youtube:search'
1230
58ca755f 1231 def __init__(self, downloader=None):
d77c3dfd 1232 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1233
1234 def report_download_page(self, query, pagenum):
d4e16d3e 1235 """Report attempt to download search page with given number."""
d77c3dfd
FV
1236 query = query.decode(preferredencoding())
1237 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1238
d77c3dfd
FV
1239 def _real_extract(self, query):
1240 mobj = re.match(self._VALID_URL, query)
1241 if mobj is None:
1242 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1243 return
1244
1245 prefix, query = query.split(':')
1246 prefix = prefix[8:]
1247 query = query.encode('utf-8')
1248 if prefix == '':
1249 self._download_n_results(query, 1)
1250 return
1251 elif prefix == 'all':
1252 self._download_n_results(query, self._max_youtube_results)
1253 return
1254 else:
1255 try:
1256 n = long(prefix)
1257 if n <= 0:
1258 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1259 return
1260 elif n > self._max_youtube_results:
1261 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1262 n = self._max_youtube_results
1263 self._download_n_results(query, n)
1264 return
1265 except ValueError: # parsing prefix as integer fails
1266 self._download_n_results(query, 1)
1267 return
1268
1269 def _download_n_results(self, query, n):
1270 """Downloads a specified number of results for a query"""
1271
1272 video_ids = []
1273 pagenum = 0
1274 limit = n
1275
1276 while (50 * pagenum) < limit:
1277 self.report_download_page(query, pagenum+1)
1278 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1279 request = urllib2.Request(result_url)
1280 try:
1281 data = urllib2.urlopen(request).read()
1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1284 return
1285 api_response = json.loads(data)['data']
1286
1287 new_ids = list(video['id'] for video in api_response['items'])
1288 video_ids += new_ids
1289
1290 limit = min(n, api_response['totalItems'])
1291 pagenum += 1
1292
1293 if len(video_ids) > n:
1294 video_ids = video_ids[:n]
1295 for id in video_ids:
58ca755f 1296 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
d77c3dfd
FV
1297 return
1298
1299
1300class GoogleSearchIE(InfoExtractor):
1301 """Information Extractor for Google Video search queries."""
1302 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1303 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1304 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1305 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
d77c3dfd
FV
1306 _max_google_results = 1000
1307 IE_NAME = u'video.google:search'
1308
58ca755f 1309 def __init__(self, downloader=None):
d77c3dfd 1310 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1311
1312 def report_download_page(self, query, pagenum):
1313 """Report attempt to download playlist page with given number."""
1314 query = query.decode(preferredencoding())
1315 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1316
d77c3dfd
FV
1317 def _real_extract(self, query):
1318 mobj = re.match(self._VALID_URL, query)
1319 if mobj is None:
1320 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1321 return
1322
1323 prefix, query = query.split(':')
1324 prefix = prefix[8:]
1325 query = query.encode('utf-8')
1326 if prefix == '':
1327 self._download_n_results(query, 1)
1328 return
1329 elif prefix == 'all':
1330 self._download_n_results(query, self._max_google_results)
1331 return
1332 else:
1333 try:
1334 n = long(prefix)
1335 if n <= 0:
1336 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1337 return
1338 elif n > self._max_google_results:
1339 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1340 n = self._max_google_results
1341 self._download_n_results(query, n)
1342 return
1343 except ValueError: # parsing prefix as integer fails
1344 self._download_n_results(query, 1)
1345 return
1346
1347 def _download_n_results(self, query, n):
1348 """Downloads a specified number of results for a query"""
1349
1350 video_ids = []
1351 pagenum = 0
1352
1353 while True:
1354 self.report_download_page(query, pagenum)
1355 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1356 request = urllib2.Request(result_url)
1357 try:
1358 page = urllib2.urlopen(request).read()
1359 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1360 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1361 return
1362
1363 # Extract video identifiers
1364 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1365 video_id = mobj.group(1)
1366 if video_id not in video_ids:
1367 video_ids.append(video_id)
1368 if len(video_ids) == n:
1369 # Specified n videos reached
1370 for id in video_ids:
58ca755f 1371 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
d77c3dfd
FV
1372 return
1373
1374 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1375 for id in video_ids:
58ca755f 1376 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
d77c3dfd
FV
1377 return
1378
1379 pagenum = pagenum + 1
1380
1381
1382class YahooSearchIE(InfoExtractor):
1383 """Information Extractor for Yahoo! Video search queries."""
1384 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1385 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1386 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1387 _MORE_PAGES_INDICATOR = r'\s*Next'
d77c3dfd
FV
1388 _max_yahoo_results = 1000
1389 IE_NAME = u'video.yahoo:search'
1390
58ca755f 1391 def __init__(self, downloader=None):
d77c3dfd 1392 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1393
1394 def report_download_page(self, query, pagenum):
1395 """Report attempt to download playlist page with given number."""
1396 query = query.decode(preferredencoding())
1397 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1398
d77c3dfd
FV
1399 def _real_extract(self, query):
1400 mobj = re.match(self._VALID_URL, query)
1401 if mobj is None:
1402 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1403 return
1404
1405 prefix, query = query.split(':')
1406 prefix = prefix[8:]
1407 query = query.encode('utf-8')
1408 if prefix == '':
1409 self._download_n_results(query, 1)
1410 return
1411 elif prefix == 'all':
1412 self._download_n_results(query, self._max_yahoo_results)
1413 return
1414 else:
1415 try:
1416 n = long(prefix)
1417 if n <= 0:
1418 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1419 return
1420 elif n > self._max_yahoo_results:
1421 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1422 n = self._max_yahoo_results
1423 self._download_n_results(query, n)
1424 return
1425 except ValueError: # parsing prefix as integer fails
1426 self._download_n_results(query, 1)
1427 return
1428
1429 def _download_n_results(self, query, n):
1430 """Downloads a specified number of results for a query"""
1431
1432 video_ids = []
1433 already_seen = set()
1434 pagenum = 1
1435
1436 while True:
1437 self.report_download_page(query, pagenum)
1438 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1439 request = urllib2.Request(result_url)
1440 try:
1441 page = urllib2.urlopen(request).read()
1442 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1443 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1444 return
1445
1446 # Extract video identifiers
1447 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1448 video_id = mobj.group(1)
1449 if video_id not in already_seen:
1450 video_ids.append(video_id)
1451 already_seen.add(video_id)
1452 if len(video_ids) == n:
1453 # Specified n videos reached
1454 for id in video_ids:
58ca755f 1455 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1456 return
1457
1458 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1459 for id in video_ids:
58ca755f 1460 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1461 return
1462
1463 pagenum = pagenum + 1
1464
1465
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches course/view_play_list/my_playlists/artist/playlist URLs as well
    # as /user/.../user/, /p/ and #[pg]/c/ forms. group(1) selects the
    # playlist type ('p', 'a' or 'list'), group(2) the playlist id, and
    # group(3) a trailing single-video id where present.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
    # Presence of this pager class marks that another page exists.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of the playlist and queue it for download.

        Scrapes playlist pages until the 'next' pager control disappears,
        then applies the playliststart/playlistend window and hands each
        video URL back to the downloader.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL carried a trailing video id, so
        # delegate just that id to the downloader.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within the page)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop once the pager no longer offers a next page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Apply the playliststart/playlistend window. playliststart is
        # 1-based and converted to a 0-based index; playlistend is used
        # directly as an exclusive slice bound.
        # NOTE(review): that makes the end bound effectively 1-based
        # exclusive — presumably intentional; confirm against FileDownloader.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1537
1538
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Accepts both full user-page URLs and the 'ytuser:<name>' shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of results the gdata API returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue all uploads of a YouTube user for download via the gdata API."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # The gdata start-index parameter is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within the page)
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the playliststart/playlistend window; same 1-based start /
        # exclusive-end slicing quirk as in YoutubePlaylistIE.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1620
1621
1622class DepositFilesIE(InfoExtractor):
1623 """Information extractor for depositfiles.com"""
1624
1625 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1626 IE_NAME = u'DepositFiles'
1627
1628 def __init__(self, downloader=None):
1629 InfoExtractor.__init__(self, downloader)
1630
1631 def report_download_webpage(self, file_id):
1632 """Report webpage download."""
1633 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1634
1635 def report_extraction(self, file_id):
1636 """Report information extraction."""
1637 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1638
1639 def _real_extract(self, url):
d77c3dfd
FV
1640 file_id = url.split('/')[-1]
1641 # Rebuild url in english locale
1642 url = 'http://depositfiles.com/en/files/' + file_id
1643
1644 # Retrieve file webpage with 'Free download' button pressed
1645 free_download_indication = { 'gateway_result' : '1' }
1646 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1647 try:
1648 self.report_download_webpage(file_id)
1649 webpage = urllib2.urlopen(request).read()
1650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1651 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1652 return
1653
1654 # Search for the real file URL
1655 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1656 if (mobj is None) or (mobj.group(1) is None):
1657 # Try to figure out reason of the error.
1658 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1659 if (mobj is not None) and (mobj.group(1) is not None):
1660 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1661 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1662 else:
1663 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1664 return
1665
1666 file_url = mobj.group(1)
1667 file_extension = os.path.splitext(file_url)[1][1:]
1668
1669 # Search for file title
1670 mobj = re.search(r'<b title="(.*?)">', webpage)
1671 if mobj is None:
1672 self._downloader.trouble(u'ERROR: unable to extract title')
1673 return
1674 file_title = mobj.group(1).decode('utf-8')
1675
58ca755f
FV
1676 return [{
1677 'id': file_id.decode('utf-8'),
1678 'url': file_url.decode('utf-8'),
1679 'uploader': u'NA',
1680 'upload_date': u'NA',
1681 'title': file_title,
58ca755f
FV
1682 'ext': file_extension.decode('utf-8'),
1683 'format': u'NA',
1684 'player_url': None,
1685 }]
d77c3dfd
FV
1686
1687
1688class FacebookIE(InfoExtractor):
1689 """Information Extractor for Facebook"""
1690
1691 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1692 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1693 _NETRC_MACHINE = 'facebook'
1694 _available_formats = ['video', 'highqual', 'lowqual']
1695 _video_extensions = {
1696 'video': 'mp4',
1697 'highqual': 'mp4',
1698 'lowqual': 'mp4',
1699 }
1700 IE_NAME = u'facebook'
1701
1702 def __init__(self, downloader=None):
1703 InfoExtractor.__init__(self, downloader)
1704
1705 def _reporter(self, message):
1706 """Add header and report message."""
1707 self._downloader.to_screen(u'[facebook] %s' % message)
1708
1709 def report_login(self):
1710 """Report attempt to log in."""
1711 self._reporter(u'Logging in')
1712
1713 def report_video_webpage_download(self, video_id):
1714 """Report attempt to download video webpage."""
1715 self._reporter(u'%s: Downloading video webpage' % video_id)
1716
1717 def report_information_extraction(self, video_id):
1718 """Report attempt to extract video information."""
1719 self._reporter(u'%s: Extracting video information' % video_id)
1720
1721 def _parse_page(self, video_webpage):
1722 """Extract video information from page"""
1723 # General data
1724 data = {'title': r'\("video_title", "(.*?)"\)',
1725 'description': r'<div class="datawrap">(.*?)</div>',
1726 'owner': r'\("video_owner_name", "(.*?)"\)',
1727 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1728 }
1729 video_info = {}
1730 for piece in data.keys():
1731 mobj = re.search(data[piece], video_webpage)
1732 if mobj is not None:
1733 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1734
1735 # Video urls
1736 video_urls = {}
1737 for fmt in self._available_formats:
1738 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1739 if mobj is not None:
1740 # URL is in a Javascript segment inside an escaped Unicode format within
1741 # the generally utf-8 page
1742 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1743 video_info['video_urls'] = video_urls
1744
1745 return video_info
1746
1747 def _real_initialize(self):
1748 if self._downloader is None:
1749 return
1750
1751 useremail = None
1752 password = None
1753 downloader_params = self._downloader.params
1754
1755 # Attempt to use provided username and password or .netrc data
1756 if downloader_params.get('username', None) is not None:
1757 useremail = downloader_params['username']
1758 password = downloader_params['password']
1759 elif downloader_params.get('usenetrc', False):
1760 try:
1761 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1762 if info is not None:
1763 useremail = info[0]
1764 password = info[2]
1765 else:
1766 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1767 except (IOError, netrc.NetrcParseError), err:
1768 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1769 return
1770
1771 if useremail is None:
1772 return
1773
1774 # Log in
1775 login_form = {
1776 'email': useremail,
1777 'pass': password,
1778 'login': 'Log+In'
1779 }
1780 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1781 try:
1782 self.report_login()
1783 login_results = urllib2.urlopen(request).read()
1784 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1785 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1786 return
1787 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1788 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1789 return
1790
1791 def _real_extract(self, url):
1792 mobj = re.match(self._VALID_URL, url)
1793 if mobj is None:
1794 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1795 return
1796 video_id = mobj.group('ID')
1797
1798 # Get video webpage
1799 self.report_video_webpage_download(video_id)
1800 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1801 try:
1802 page = urllib2.urlopen(request)
1803 video_webpage = page.read()
1804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1806 return
1807
1808 # Start extracting information
1809 self.report_information_extraction(video_id)
1810
1811 # Extract information
1812 video_info = self._parse_page(video_webpage)
1813
1814 # uploader
1815 if 'owner' not in video_info:
1816 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1817 return
1818 video_uploader = video_info['owner']
1819
1820 # title
1821 if 'title' not in video_info:
1822 self._downloader.trouble(u'ERROR: unable to extract video title')
1823 return
1824 video_title = video_info['title']
1825 video_title = video_title.decode('utf-8')
d77c3dfd
FV
1826
1827 # thumbnail image
1828 if 'thumbnail' not in video_info:
1829 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1830 video_thumbnail = ''
1831 else:
1832 video_thumbnail = video_info['thumbnail']
1833
1834 # upload date
1835 upload_date = u'NA'
1836 if 'upload_date' in video_info:
1837 upload_time = video_info['upload_date']
1838 timetuple = email.utils.parsedate_tz(upload_time)
1839 if timetuple is not None:
1840 try:
1841 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1842 except:
1843 pass
1844
1845 # description
1846 video_description = video_info.get('description', 'No description available.')
1847
1848 url_map = video_info['video_urls']
1849 if len(url_map.keys()) > 0:
1850 # Decide which formats to download
1851 req_format = self._downloader.params.get('format', None)
1852 format_limit = self._downloader.params.get('format_limit', None)
1853
1854 if format_limit is not None and format_limit in self._available_formats:
1855 format_list = self._available_formats[self._available_formats.index(format_limit):]
1856 else:
1857 format_list = self._available_formats
1858 existing_formats = [x for x in format_list if x in url_map]
1859 if len(existing_formats) == 0:
1860 self._downloader.trouble(u'ERROR: no known formats available for video')
1861 return
1862 if req_format is None:
1863 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1864 elif req_format == 'worst':
1865 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1866 elif req_format == '-1':
1867 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1868 else:
1869 # Specific format
1870 if req_format not in url_map:
1871 self._downloader.trouble(u'ERROR: requested format not available')
1872 return
1873 video_url_list = [(req_format, url_map[req_format])] # Specific format
1874
58ca755f 1875 results = []
d77c3dfd 1876 for format_param, video_real_url in video_url_list:
d77c3dfd
FV
1877 # Extension
1878 video_extension = self._video_extensions.get(format_param, 'mp4')
1879
58ca755f
FV
1880 results.append({
1881 'id': video_id.decode('utf-8'),
1882 'url': video_real_url.decode('utf-8'),
1883 'uploader': video_uploader.decode('utf-8'),
1884 'upload_date': upload_date,
1885 'title': video_title,
58ca755f
FV
1886 'ext': video_extension.decode('utf-8'),
1887 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1888 'thumbnail': video_thumbnail.decode('utf-8'),
1889 'description': video_description.decode('utf-8'),
1890 'player_url': None,
1891 })
1892 return results
d77c3dfd
FV
1893
1894class BlipTVIE(InfoExtractor):
1895 """Information extractor for blip.tv"""
1896
1897 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1898 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1899 IE_NAME = u'blip.tv'
1900
1901 def report_extraction(self, file_id):
1902 """Report information extraction."""
1903 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1904
1905 def report_direct_download(self, title):
1906 """Report information extraction."""
1907 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1908
1909 def _real_extract(self, url):
1910 mobj = re.match(self._VALID_URL, url)
1911 if mobj is None:
1912 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1913 return
1914
1915 if '?' in url:
1916 cchar = '&'
1917 else:
1918 cchar = '?'
1919 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1920 request = urllib2.Request(json_url)
1921 self.report_extraction(mobj.group(1))
1922 info = None
1923 try:
1924 urlh = urllib2.urlopen(request)
1925 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1926 basename = url.split('/')[-1]
1927 title,ext = os.path.splitext(basename)
1928 title = title.decode('UTF-8')
1929 ext = ext.replace('.', '')
1930 self.report_direct_download(title)
1931 info = {
1932 'id': title,
1933 'url': url,
1934 'title': title,
d77c3dfd
FV
1935 'ext': ext,
1936 'urlhandle': urlh
1937 }
1938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1940 return
1941 if info is None: # Regular URL
1942 try:
1943 json_code = urlh.read()
1944 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1945 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1946 return
1947
1948 try:
1949 json_data = json.loads(json_code)
1950 if 'Post' in json_data:
1951 data = json_data['Post']
1952 else:
1953 data = json_data
3fe294e4 1954
d77c3dfd
FV
1955 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1956 video_url = data['media']['url']
1957 umobj = re.match(self._URL_EXT, video_url)
1958 if umobj is None:
1959 raise ValueError('Can not determine filename extension')
1960 ext = umobj.group(1)
3fe294e4 1961
d77c3dfd
FV
1962 info = {
1963 'id': data['item_id'],
1964 'url': video_url,
1965 'uploader': data['display_name'],
1966 'upload_date': upload_date,
1967 'title': data['title'],
d77c3dfd
FV
1968 'ext': ext,
1969 'format': data['media']['mimeType'],
1970 'thumbnail': data['thumbnailUrl'],
1971 'description': data['description'],
1972 'player_url': data['embedUrl']
1973 }
1974 except (ValueError,KeyError), err:
1975 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1976 return
1977
58ca755f 1978 return [info]
d77c3dfd
FV
1979
1980
1981class MyVideoIE(InfoExtractor):
1982 """Information Extractor for myvideo.de."""
1983
1984 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1985 IE_NAME = u'myvideo'
1986
1987 def __init__(self, downloader=None):
1988 InfoExtractor.__init__(self, downloader)
1989
1990 def report_download_webpage(self, video_id):
1991 """Report webpage download."""
1992 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1993
1994 def report_extraction(self, video_id):
1995 """Report information extraction."""
1996 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1997
1998 def _real_extract(self,url):
1999 mobj = re.match(self._VALID_URL, url)
2000 if mobj is None:
2001 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2002 return
2003
2004 video_id = mobj.group(1)
2005
2006 # Get video webpage
2007 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2008 try:
2009 self.report_download_webpage(video_id)
2010 webpage = urllib2.urlopen(request).read()
2011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2012 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2013 return
2014
2015 self.report_extraction(video_id)
2016 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2017 webpage)
2018 if mobj is None:
2019 self._downloader.trouble(u'ERROR: unable to extract media URL')
2020 return
2021 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2022
2023 mobj = re.search('<title>([^<]+)</title>', webpage)
2024 if mobj is None:
2025 self._downloader.trouble(u'ERROR: unable to extract title')
2026 return
2027
2028 video_title = mobj.group(1)
d77c3dfd 2029
58ca755f
FV
2030 return [{
2031 'id': video_id,
2032 'url': video_url,
2033 'uploader': u'NA',
2034 'upload_date': u'NA',
2035 'title': video_title,
58ca755f
FV
2036 'ext': u'flv',
2037 'format': u'NA',
2038 'player_url': None,
2039 }]
d77c3dfd
FV
2040
2041class ComedyCentralIE(InfoExtractor):
2042 """Information extractor for The Daily Show and Colbert Report """
2043
2044 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2045 IE_NAME = u'comedycentral'
2046
2047 def report_extraction(self, episode_id):
2048 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3fe294e4 2049
d77c3dfd
FV
2050 def report_config_download(self, episode_id):
2051 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2052
2053 def report_index_download(self, episode_id):
2054 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2055
2056 def report_player_url(self, episode_id):
2057 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2058
2059 def _real_extract(self, url):
2060 mobj = re.match(self._VALID_URL, url)
2061 if mobj is None:
2062 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2063 return
2064
2065 if mobj.group('shortname'):
2066 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2067 url = u'http://www.thedailyshow.com/full-episodes/'
2068 else:
2069 url = u'http://www.colbertnation.com/full-episodes/'
2070 mobj = re.match(self._VALID_URL, url)
2071 assert mobj is not None
2072
2073 dlNewest = not mobj.group('episode')
2074 if dlNewest:
2075 epTitle = mobj.group('showname')
2076 else:
2077 epTitle = mobj.group('episode')
2078
2079 req = urllib2.Request(url)
2080 self.report_extraction(epTitle)
2081 try:
2082 htmlHandle = urllib2.urlopen(req)
2083 html = htmlHandle.read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2086 return
2087 if dlNewest:
2088 url = htmlHandle.geturl()
2089 mobj = re.match(self._VALID_URL, url)
2090 if mobj is None:
2091 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2092 return
2093 if mobj.group('episode') == '':
2094 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2095 return
2096 epTitle = mobj.group('episode')
2097
2098 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2099 if len(mMovieParams) == 0:
2100 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2101 return
2102
2103 playerUrl_raw = mMovieParams[0][0]
2104 self.report_player_url(epTitle)
2105 try:
2106 urlHandle = urllib2.urlopen(playerUrl_raw)
2107 playerUrl = urlHandle.geturl()
2108 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2109 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2110 return
2111
2112 uri = mMovieParams[0][1]
2113 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2114 self.report_index_download(epTitle)
2115 try:
2116 indexXml = urllib2.urlopen(indexUrl).read()
2117 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2119 return
2120
58ca755f
FV
2121 results = []
2122
d77c3dfd
FV
2123 idoc = xml.etree.ElementTree.fromstring(indexXml)
2124 itemEls = idoc.findall('.//item')
2125 for itemEl in itemEls:
2126 mediaId = itemEl.findall('./guid')[0].text
2127 shortMediaId = mediaId.split(':')[-1]
2128 showId = mediaId.split(':')[-2].replace('.com', '')
2129 officialTitle = itemEl.findall('./title')[0].text
2130 officialDate = itemEl.findall('./pubDate')[0].text
2131
2132 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2133 urllib.urlencode({'uri': mediaId}))
2134 configReq = urllib2.Request(configUrl)
2135 self.report_config_download(epTitle)
2136 try:
2137 configXml = urllib2.urlopen(configReq).read()
2138 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2139 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2140 return
2141
2142 cdoc = xml.etree.ElementTree.fromstring(configXml)
2143 turls = []
2144 for rendition in cdoc.findall('.//rendition'):
2145 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2146 turls.append(finfo)
2147
2148 if len(turls) == 0:
2149 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2150 continue
2151
2152 # For now, just pick the highest bitrate
2153 format,video_url = turls[-1]
2154
d77c3dfd
FV
2155 effTitle = showId + u'-' + epTitle
2156 info = {
2157 'id': shortMediaId,
2158 'url': video_url,
2159 'uploader': showId,
2160 'upload_date': officialDate,
2161 'title': effTitle,
d77c3dfd
FV
2162 'ext': 'mp4',
2163 'format': format,
2164 'thumbnail': None,
2165 'description': officialTitle,
2166 'player_url': playerUrl
2167 }
2168
58ca755f
FV
2169 results.append(info)
2170
2171 return results
d77c3dfd
FV
2172
2173
2174class EscapistIE(InfoExtractor):
2175 """Information extractor for The Escapist """
2176
2177 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2178 IE_NAME = u'escapist'
2179
2180 def report_extraction(self, showName):
2181 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2182
2183 def report_config_download(self, showName):
2184 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2185
2186 def _real_extract(self, url):
d77c3dfd
FV
2187 mobj = re.match(self._VALID_URL, url)
2188 if mobj is None:
2189 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2190 return
2191 showName = mobj.group('showname')
2192 videoId = mobj.group('episode')
2193
2194 self.report_extraction(showName)
2195 try:
9ab3406d 2196 webPageBytes = urllib2.urlopen(url).read()
d77c3dfd
FV
2197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2198 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2199 return
2200
9ab3406d 2201 webPage = webPageBytes.decode('utf-8')
d77c3dfd 2202 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
9e6dd238 2203 description = unescapeHTML(descMatch.group(1))
d77c3dfd 2204 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
9e6dd238 2205 imgUrl = unescapeHTML(imgMatch.group(1))
d77c3dfd 2206 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
9e6dd238 2207 playerUrl = unescapeHTML(playerUrlMatch.group(1))
d77c3dfd
FV
2208 configUrlMatch = re.search('config=(.*)$', playerUrl)
2209 configUrl = urllib2.unquote(configUrlMatch.group(1))
2210
2211 self.report_config_download(showName)
2212 try:
2213 configJSON = urllib2.urlopen(configUrl).read()
2214 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2215 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2216 return
2217
2218 # Technically, it's JavaScript, not JSON
2219 configJSON = configJSON.replace("'", '"')
2220
2221 try:
2222 config = json.loads(configJSON)
2223 except (ValueError,), err:
2224 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2225 return
2226
2227 playlist = config['playlist']
2228 videoUrl = playlist[1]['url']
2229
d77c3dfd
FV
2230 info = {
2231 'id': videoId,
2232 'url': videoUrl,
2233 'uploader': showName,
2234 'upload_date': None,
2235 'title': showName,
d77c3dfd
FV
2236 'ext': 'flv',
2237 'format': 'flv',
2238 'thumbnail': imgUrl,
2239 'description': description,
2240 'player_url': playerUrl,
2241 }
2242
58ca755f 2243 return [info]
d77c3dfd
FV
2244
2245
2246class CollegeHumorIE(InfoExtractor):
2247 """Information extractor for collegehumor.com"""
2248
2249 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2250 IE_NAME = u'collegehumor'
2251
2252 def report_webpage(self, video_id):
2253 """Report information extraction."""
2254 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2255
2256 def report_extraction(self, video_id):
2257 """Report information extraction."""
2258 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2259
2260 def _real_extract(self, url):
d77c3dfd
FV
2261 mobj = re.match(self._VALID_URL, url)
2262 if mobj is None:
2263 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2264 return
2265 video_id = mobj.group('videoid')
2266
2267 self.report_webpage(video_id)
2268 request = urllib2.Request(url)
2269 try:
2270 webpage = urllib2.urlopen(request).read()
2271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2272 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2273 return
2274
2275 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2276 if m is None:
2277 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2278 return
2279 internal_video_id = m.group('internalvideoid')
2280
2281 info = {
2282 'id': video_id,
2283 'internal_id': internal_video_id,
2284 }
2285
2286 self.report_extraction(video_id)
2287 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2288 try:
2289 metaXml = urllib2.urlopen(xmlUrl).read()
2290 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2291 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2292 return
2293
2294 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2295 try:
2296 videoNode = mdoc.findall('./video')[0]
2297 info['description'] = videoNode.findall('./description')[0].text
2298 info['title'] = videoNode.findall('./caption')[0].text
d77c3dfd
FV
2299 info['url'] = videoNode.findall('./file')[0].text
2300 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2301 info['ext'] = info['url'].rpartition('.')[2]
2302 info['format'] = info['ext']
2303 except IndexError:
2304 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2305 return
2306
58ca755f 2307 return [info]
d77c3dfd
FV
2308
2309
2310class XVideosIE(InfoExtractor):
2311 """Information extractor for xvideos.com"""
2312
2313 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2314 IE_NAME = u'xvideos'
2315
2316 def report_webpage(self, video_id):
2317 """Report information extraction."""
2318 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2319
2320 def report_extraction(self, video_id):
2321 """Report information extraction."""
2322 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2323
2324 def _real_extract(self, url):
d77c3dfd
FV
2325 mobj = re.match(self._VALID_URL, url)
2326 if mobj is None:
2327 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2328 return
2329 video_id = mobj.group(1).decode('utf-8')
2330
2331 self.report_webpage(video_id)
2332
2333 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2334 try:
2335 webpage = urllib2.urlopen(request).read()
2336 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2337 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2338 return
2339
2340 self.report_extraction(video_id)
2341
2342
2343 # Extract video URL
2344 mobj = re.search(r'flv_url=(.+?)&', webpage)
2345 if mobj is None:
2346 self._downloader.trouble(u'ERROR: unable to extract video url')
2347 return
2348 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2349
2350
2351 # Extract title
2352 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2353 if mobj is None:
2354 self._downloader.trouble(u'ERROR: unable to extract video title')
2355 return
2356 video_title = mobj.group(1).decode('utf-8')
2357
2358
2359 # Extract video thumbnail
2360 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2361 if mobj is None:
2362 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2363 return
2364 video_thumbnail = mobj.group(1).decode('utf-8')
2365
d77c3dfd
FV
2366 info = {
2367 'id': video_id,
2368 'url': video_url,
2369 'uploader': None,
2370 'upload_date': None,
2371 'title': video_title,
d77c3dfd
FV
2372 'ext': 'flv',
2373 'format': 'flv',
2374 'thumbnail': video_thumbnail,
2375 'description': None,
2376 'player_url': None,
2377 }
2378
58ca755f 2379 return [info]
d77c3dfd
FV
2380
2381
2382class SoundcloudIE(InfoExtractor):
2383 """Information extractor for soundcloud.com
2384 To access the media, the uid of the song and a stream token
2385 must be extracted from the page source and the script must make
2386 a request to media.soundcloud.com/crossdomain.xml. Then
2387 the media can be grabbed by requesting from an url composed
2388 of the stream token and uid
2389 """
2390
2391 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2392 IE_NAME = u'soundcloud'
2393
2394 def __init__(self, downloader=None):
2395 InfoExtractor.__init__(self, downloader)
2396
2397 def report_webpage(self, video_id):
2398 """Report information extraction."""
2399 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2400
2401 def report_extraction(self, video_id):
2402 """Report information extraction."""
2403 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2404
2405 def _real_extract(self, url):
d77c3dfd
FV
2406 mobj = re.match(self._VALID_URL, url)
2407 if mobj is None:
2408 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2409 return
2410
2411 # extract uploader (which is in the url)
2412 uploader = mobj.group(1).decode('utf-8')
2413 # extract simple title (uploader + slug of song title)
2414 slug_title = mobj.group(2).decode('utf-8')
2c288bda 2415 simple_title = uploader + u'-' + slug_title
d77c3dfd
FV
2416
2417 self.report_webpage('%s/%s' % (uploader, slug_title))
2418
2419 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2420 try:
2421 webpage = urllib2.urlopen(request).read()
2422 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2423 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2424 return
2425
2426 self.report_extraction('%s/%s' % (uploader, slug_title))
2427
2428 # extract uid and stream token that soundcloud hands out for access
2429 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2430 if mobj:
2431 video_id = mobj.group(1)
2432 stream_token = mobj.group(2)
2433
2434 # extract unsimplified title
2435 mobj = re.search('"title":"(.*?)",', webpage)
2436 if mobj:
2c288bda
FV
2437 title = mobj.group(1).decode('utf-8')
2438 else:
2439 title = simple_title
d77c3dfd
FV
2440
2441 # construct media url (with uid/token)
2442 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2443 mediaURL = mediaURL % (video_id, stream_token)
2444
2445 # description
2446 description = u'No description available'
2447 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2448 if mobj:
2449 description = mobj.group(1)
2450
2451 # upload date
2452 upload_date = None
2453 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2454 if mobj:
2455 try:
2456 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2457 except Exception, e:
6ab92c8b 2458 self._downloader.to_stderr(str(e))
d77c3dfd
FV
2459
2460 # for soundcloud, a request to a cross domain is required for cookies
2461 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2462
58ca755f
FV
2463 return [{
2464 'id': video_id.decode('utf-8'),
2465 'url': mediaURL,
2466 'uploader': uploader.decode('utf-8'),
2467 'upload_date': upload_date,
2c288bda 2468 'title': title,
58ca755f
FV
2469 'ext': u'mp3',
2470 'format': u'NA',
2471 'player_url': None,
2472 'description': description.decode('utf-8')
2473 }]
d77c3dfd
FV
2474
2475
2476class InfoQIE(InfoExtractor):
2477 """Information extractor for infoq.com"""
2478
2479 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2480 IE_NAME = u'infoq'
2481
2482 def report_webpage(self, video_id):
2483 """Report information extraction."""
2484 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2485
2486 def report_extraction(self, video_id):
2487 """Report information extraction."""
2488 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2489
2490 def _real_extract(self, url):
d77c3dfd
FV
2491 mobj = re.match(self._VALID_URL, url)
2492 if mobj is None:
2493 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2494 return
2495
2496 self.report_webpage(url)
2497
2498 request = urllib2.Request(url)
2499 try:
2500 webpage = urllib2.urlopen(request).read()
2501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2502 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2503 return
2504
2505 self.report_extraction(url)
2506
2507
2508 # Extract video URL
2509 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2510 if mobj is None:
2511 self._downloader.trouble(u'ERROR: unable to extract video url')
2512 return
2513 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2514
2515
2516 # Extract title
2517 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2518 if mobj is None:
2519 self._downloader.trouble(u'ERROR: unable to extract video title')
2520 return
2521 video_title = mobj.group(1).decode('utf-8')
2522
2523 # Extract description
2524 video_description = u'No description available.'
2525 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2526 if mobj is not None:
2527 video_description = mobj.group(1).decode('utf-8')
2528
2529 video_filename = video_url.split('/')[-1]
2530 video_id, extension = video_filename.split('.')
2531
d77c3dfd
FV
2532 info = {
2533 'id': video_id,
2534 'url': video_url,
2535 'uploader': None,
2536 'upload_date': None,
2537 'title': video_title,
d77c3dfd
FV
2538 'ext': extension,
2539 'format': extension, # Extension is always(?) mp4, but seems to be flv
2540 'thumbnail': None,
2541 'description': video_description,
2542 'player_url': None,
2543 }
2544
58ca755f 2545 return [info]
d77c3dfd
FV
2546
2547class MixcloudIE(InfoExtractor):
2548 """Information extractor for www.mixcloud.com"""
2549 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2550 IE_NAME = u'mixcloud'
2551
2552 def __init__(self, downloader=None):
2553 InfoExtractor.__init__(self, downloader)
2554
2555 def report_download_json(self, file_id):
2556 """Report JSON download."""
2557 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2558
2559 def report_extraction(self, file_id):
2560 """Report information extraction."""
2561 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2562
2563 def get_urls(self, jsonData, fmt, bitrate='best'):
2564 """Get urls from 'audio_formats' section in json"""
2565 file_url = None
2566 try:
2567 bitrate_list = jsonData[fmt]
2568 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2569 bitrate = max(bitrate_list) # select highest
2570
2571 url_list = jsonData[fmt][bitrate]
2572 except TypeError: # we have no bitrate info.
2573 url_list = jsonData[fmt]
d77c3dfd
FV
2574 return url_list
2575
2576 def check_urls(self, url_list):
2577 """Returns 1st active url from list"""
2578 for url in url_list:
2579 try:
2580 urllib2.urlopen(url)
2581 return url
2582 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2583 url = None
2584
2585 return None
2586
2587 def _print_formats(self, formats):
2588 print 'Available formats:'
2589 for fmt in formats.keys():
2590 for b in formats[fmt]:
2591 try:
2592 ext = formats[fmt][b][0]
2593 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2594 except TypeError: # we have no bitrate info
2595 ext = formats[fmt][0]
2596 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2597 break
2598
2599 def _real_extract(self, url):
2600 mobj = re.match(self._VALID_URL, url)
2601 if mobj is None:
2602 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2603 return
2604 # extract uploader & filename from url
2605 uploader = mobj.group(1).decode('utf-8')
2606 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2607
2608 # construct API request
2609 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2610 # retrieve .json file with links to files
2611 request = urllib2.Request(file_url)
2612 try:
2613 self.report_download_json(file_url)
2614 jsonData = urllib2.urlopen(request).read()
2615 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2616 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2617 return
2618
2619 # parse JSON
2620 json_data = json.loads(jsonData)
2621 player_url = json_data['player_swf_url']
2622 formats = dict(json_data['audio_formats'])
2623
2624 req_format = self._downloader.params.get('format', None)
2625 bitrate = None
2626
2627 if self._downloader.params.get('listformats', None):
2628 self._print_formats(formats)
2629 return
2630
2631 if req_format is None or req_format == 'best':
2632 for format_param in formats.keys():
2633 url_list = self.get_urls(formats, format_param)
2634 # check urls
2635 file_url = self.check_urls(url_list)
2636 if file_url is not None:
2637 break # got it!
2638 else:
2639 if req_format not in formats.keys():
2640 self._downloader.trouble(u'ERROR: format is not available')
2641 return
2642
2643 url_list = self.get_urls(formats, req_format)
2644 file_url = self.check_urls(url_list)
2645 format_param = req_format
2646
58ca755f
FV
2647 return [{
2648 'id': file_id.decode('utf-8'),
2649 'url': file_url.decode('utf-8'),
2650 'uploader': uploader.decode('utf-8'),
2651 'upload_date': u'NA',
2652 'title': json_data['name'],
58ca755f
FV
2653 'ext': file_url.split('.')[-1].decode('utf-8'),
2654 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2655 'thumbnail': json_data['thumbnail_url'],
2656 'description': json_data['description'],
2657 'player_url': player_url.decode('utf-8'),
2658 }]
d77c3dfd
FV
2659
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom

	Handles three URL shapes via one regex:
	  - VideoPage.php?course=X&video=Y  -> a single video (metadata from XML)
	  - CoursePage.php?course=X         -> a course page, expanded recursively
	  - the site root / HomePage.php    -> all courses, expanded recursively
	"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			# Video metadata (title, file name) lives in a per-video XML file
			# alongside the media under the course's videos/ directory.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			# Derive extension from the media URL; format mirrors it.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title from the page heading; fall back to the course id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect every VideoPage link once (orderedSet de-duplicates
			# while preserving page order) and recurse into each.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Every course page linked from the home page, expanded recursively
			# through the course branch above.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
d77c3dfd
FV
2771
2772class MTVIE(InfoExtractor):
2773 """Information extractor for MTV.com"""
2774
2775 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2776 IE_NAME = u'mtv'
2777
2778 def report_webpage(self, video_id):
2779 """Report information extraction."""
2780 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2781
2782 def report_extraction(self, video_id):
2783 """Report information extraction."""
2784 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2785
2786 def _real_extract(self, url):
2787 mobj = re.match(self._VALID_URL, url)
2788 if mobj is None:
2789 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2790 return
2791 if not mobj.group('proto'):
2792 url = 'http://' + url
2793 video_id = mobj.group('videoid')
2794 self.report_webpage(video_id)
2795
2796 request = urllib2.Request(url)
2797 try:
2798 webpage = urllib2.urlopen(request).read()
2799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2800 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2801 return
2802
2803 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2804 if mobj is None:
2805 self._downloader.trouble(u'ERROR: unable to extract song name')
2806 return
2807 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2808 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2809 if mobj is None:
2810 self._downloader.trouble(u'ERROR: unable to extract performer')
2811 return
2812 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2813 video_title = performer + ' - ' + song_name
2814
2815 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2816 if mobj is None:
2817 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2818 return
2819 mtvn_uri = mobj.group(1)
2820
2821 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2822 if mobj is None:
2823 self._downloader.trouble(u'ERROR: unable to extract content id')
2824 return
2825 content_id = mobj.group(1)
2826
2827 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2828 self.report_extraction(video_id)
2829 request = urllib2.Request(videogen_url)
2830 try:
2831 metadataXml = urllib2.urlopen(request).read()
2832 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2833 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2834 return
2835
2836 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2837 renditions = mdoc.findall('.//rendition')
2838
2839 # For now, always pick the highest quality.
2840 rendition = renditions[-1]
2841
2842 try:
2843 _,_,ext = rendition.attrib['type'].partition('/')
2844 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2845 video_url = rendition.find('./src').text
2846 except KeyError:
2847 self._downloader.trouble('Invalid rendition field.')
2848 return
2849
d77c3dfd
FV
2850 info = {
2851 'id': video_id,
2852 'url': video_url,
2853 'uploader': performer,
2854 'title': video_title,
d77c3dfd
FV
2855 'ext': ext,
2856 'format': format,
2857 }
2858
58ca755f 2859 return [info]