]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
Made the YouTubeIE regex verbose/commented
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import datetime
5import HTMLParser
6import httplib
7import netrc
8import os
9import re
10import socket
11import time
12import urllib
13import urllib2
14import email.utils
921a1455
FV
15import xml.etree.ElementTree
16from urlparse import parse_qs
d77c3dfd
FV
17
18try:
19 import cStringIO as StringIO
20except ImportError:
21 import StringIO
22
d11d05d0 23from utils import *
d77c3dfd
FV
24
25
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and produces the information
    for the video (or videos) that URL refers to: the real video URL, the
    title, the uploader, and so on. The result is a dictionary handed to
    the FileDownloader, which may then download the video to disk (among
    other outcomes). Each dictionary must carry these fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    These fields are optional; they mainly support a video-search backend
    (such as youtube2mp3) and are only used by the forced-printing helpers:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses should override _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and usually be added to the extractor list.
    """

    # Class-level defaults; instances reset _ready in __init__.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when this extractor can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Perform one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the extracted info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for reporting and downloading."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
93
94
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matches watch/embed/shortened URLs and also a naked
    # 11-character-style video ID (group 1 optional; group 2 is the ID).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/) # the various hostnames, with wildcard subdomains
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forcing hl=en/gl=US keeps the scraped pages in a predictable language.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" dimensions string, used only for --list-formats output
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        Each <text> element becomes one numbered SRT cue; entries with no
        explicit duration default to 4 seconds.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Format as HH:MM:SS,mmm (SRT timestamp syntax).
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick formats, and
        return a list of info dicts (one per selected format)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page's JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try several date layouts.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: best-effort; any failure is reported via Trouble
        # and extraction continues without subtitles.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: explicit --sub-lang, then English,
                # then whatever comes first in the track list.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> URL map from the comma-separated stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':		video_id.decode('utf-8'),
                'url':		video_real_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	upload_date,
                'title':	video_title,
                'ext':		video_extension.decode('utf-8'),
                'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':	video_thumbnail.decode('utf-8'),
                'description':	video_description,
                'player_url':	player_url,
                'subtitles':	video_subtitles
            })
        return results
d77c3dfd
FV
482
483
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page visited during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint that receives the age-confirmation / filter form POST.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age/filter confirmation."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to YoutubeIE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars blob for mediaData.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
            'format':	u'NA',
            'player_url':	None,
        }]
d77c3dfd
FV
611
612
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, uploader and title for a video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages still load.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        sequence = urllib.unquote(mobj.group(1))
        # The "sequence" flashvar contains the sdURL of the actual stream.
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
            'format':	u'NA',
            'player_url':	None,
        }]
d77c3dfd
FV
690
691
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and description for a Google Video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No mp4 download link; fall back to the escaped flv videoUrl.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the JS hex escaping of '=' and '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when explicitly requested, since it
        # needs an extra search-page request).
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	u'NA',
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
            'format':	u'NA',
            'player_url':	None,
        }]
d77c3dfd
FV
785
786
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader for a Photobucket clip."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # The <title> carries both the title and the uploader nickname.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader,
            'upload_date':	u'NA',
            'title':	video_title,
            'ext':		video_extension.decode('utf-8'),
            'format':	u'NA',
            'player_url':	None,
        }]
d77c3dfd
FV
852
853
854class YahooIE(InfoExtractor):
855 """Information extractor for video.yahoo.com."""
856
857 # _VALID_URL matches all Yahoo! Video URLs
858 # _VPAGE_URL matches only the extractable '/watch/' URLs
859 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
860 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
861 IE_NAME = u'video.yahoo'
862
863 def __init__(self, downloader=None):
864 InfoExtractor.__init__(self, downloader)
865
866 def report_download_webpage(self, video_id):
867 """Report webpage download."""
868 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
869
870 def report_extraction(self, video_id):
871 """Report information extraction."""
872 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
873
874 def _real_extract(self, url, new_video=True):
875 # Extract ID from URL
876 mobj = re.match(self._VALID_URL, url)
877 if mobj is None:
878 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
879 return
880
d77c3dfd
FV
881 video_id = mobj.group(2)
882 video_extension = 'flv'
883
884 # Rewrite valid but non-extractable URLs as
885 # extractable English language /watch/ URLs
886 if re.match(self._VPAGE_URL, url) is None:
887 request = urllib2.Request(url)
888 try:
889 webpage = urllib2.urlopen(request).read()
890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
892 return
893
894 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
895 if mobj is None:
896 self._downloader.trouble(u'ERROR: Unable to extract id field')
897 return
898 yahoo_id = mobj.group(1)
899
900 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
901 if mobj is None:
902 self._downloader.trouble(u'ERROR: Unable to extract vid field')
903 return
904 yahoo_vid = mobj.group(1)
905
906 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
907 return self._real_extract(url, new_video=False)
908
909 # Retrieve video webpage to extract further information
910 request = urllib2.Request(url)
911 try:
912 self.report_download_webpage(video_id)
913 webpage = urllib2.urlopen(request).read()
914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
915 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
916 return
917
918 # Extract uploader and title from webpage
919 self.report_extraction(video_id)
920 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
921 if mobj is None:
922 self._downloader.trouble(u'ERROR: unable to extract video title')
923 return
924 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
925
926 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
927 if mobj is None:
928 self._downloader.trouble(u'ERROR: unable to extract video uploader')
929 return
930 video_uploader = mobj.group(1).decode('utf-8')
931
932 # Extract video thumbnail
933 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
934 if mobj is None:
935 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
936 return
937 video_thumbnail = mobj.group(1).decode('utf-8')
938
939 # Extract video description
940 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
941 if mobj is None:
942 self._downloader.trouble(u'ERROR: unable to extract video description')
943 return
944 video_description = mobj.group(1).decode('utf-8')
945 if not video_description:
946 video_description = 'No description available.'
947
948 # Extract video height and width
949 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
950 if mobj is None:
951 self._downloader.trouble(u'ERROR: unable to extract video height')
952 return
953 yv_video_height = mobj.group(1)
954
955 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
956 if mobj is None:
957 self._downloader.trouble(u'ERROR: unable to extract video width')
958 return
959 yv_video_width = mobj.group(1)
960
961 # Retrieve video playlist to extract media URL
962 # I'm not completely sure what all these options are, but we
963 # seem to need most of them, otherwise the server sends a 401.
964 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
965 yv_bitrate = '700' # according to Wikipedia this is hard-coded
966 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
967 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
968 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
969 try:
970 self.report_download_webpage(video_id)
971 webpage = urllib2.urlopen(request).read()
972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
973 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
974 return
975
976 # Extract media URL from playlist XML
977 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
978 if mobj is None:
979 self._downloader.trouble(u'ERROR: Unable to extract media URL')
980 return
981 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
9e6dd238 982 video_url = unescapeHTML(video_url)
d77c3dfd 983
58ca755f
FV
984 return [{
985 'id': video_id.decode('utf-8'),
986 'url': video_url,
987 'uploader': video_uploader,
988 'upload_date': u'NA',
989 'title': video_title,
58ca755f
FV
990 'ext': video_extension.decode('utf-8'),
991 'thumbnail': video_thumbnail.decode('utf-8'),
992 'description': video_description,
993 'thumbnail': video_thumbnail,
994 'player_url': None,
995 }]
d77c3dfd
FV
996
997
998class VimeoIE(InfoExtractor):
999 """Information extractor for vimeo.com."""
1000
1001 # _VALID_URL matches Vimeo URLs
1002 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1003 IE_NAME = u'vimeo'
1004
1005 def __init__(self, downloader=None):
1006 InfoExtractor.__init__(self, downloader)
1007
1008 def report_download_webpage(self, video_id):
1009 """Report webpage download."""
1010 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1011
1012 def report_extraction(self, video_id):
1013 """Report information extraction."""
1014 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1015
1016 def _real_extract(self, url, new_video=True):
1017 # Extract ID from URL
1018 mobj = re.match(self._VALID_URL, url)
1019 if mobj is None:
1020 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1021 return
1022
d77c3dfd
FV
1023 video_id = mobj.group(1)
1024
1025 # Retrieve video webpage to extract further information
1026 request = urllib2.Request(url, None, std_headers)
1027 try:
1028 self.report_download_webpage(video_id)
1029 webpage = urllib2.urlopen(request).read()
1030 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1031 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1032 return
1033
1034 # Now we begin extracting as much information as we can from what we
1035 # retrieved. First we extract the information common to all extractors,
1036 # and latter we extract those that are Vimeo specific.
1037 self.report_extraction(video_id)
1038
1039 # Extract the config JSON
1040 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1041 try:
1042 config = json.loads(config)
1043 except:
1044 self._downloader.trouble(u'ERROR: unable to extract info section')
1045 return
1046
1047 # Extract title
1048 video_title = config["video"]["title"]
d77c3dfd
FV
1049
1050 # Extract uploader
1051 video_uploader = config["video"]["owner"]["name"]
1052
1053 # Extract video thumbnail
1054 video_thumbnail = config["video"]["thumbnail"]
1055
1056 # Extract video description
9beb5af8
FV
1057 video_description = get_element_by_id("description", webpage.decode('utf8'))
1058 if video_description: video_description = clean_html(video_description)
9e6dd238 1059 else: video_description = ''
d77c3dfd
FV
1060
1061 # Extract upload date
1062 video_upload_date = u'NA'
1063 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1064 if mobj is not None:
1065 video_upload_date = mobj.group(1)
1066
1067 # Vimeo specific: extract request signature and timestamp
1068 sig = config['request']['signature']
1069 timestamp = config['request']['timestamp']
1070
1071 # Vimeo specific: extract video codec and quality information
1072 # TODO bind to format param
1073 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1074 for codec in codecs:
1075 if codec[0] in config["video"]["files"]:
1076 video_codec = codec[0]
1077 video_extension = codec[1]
1078 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1079 else: quality = 'sd'
1080 break
1081 else:
1082 self._downloader.trouble(u'ERROR: no known codec found')
1083 return
1084
1085 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1086 %(video_id, sig, timestamp, quality, video_codec.upper())
1087
58ca755f
FV
1088 return [{
1089 'id': video_id,
1090 'url': video_url,
1091 'uploader': video_uploader,
1092 'upload_date': video_upload_date,
1093 'title': video_title,
58ca755f
FV
1094 'ext': video_extension,
1095 'thumbnail': video_thumbnail,
1096 'description': video_description,
1097 'player_url': None,
1098 }]
d77c3dfd
FV
1099
1100
1101class GenericIE(InfoExtractor):
1102 """Generic last-resort information extractor."""
1103
1104 _VALID_URL = r'.*'
1105 IE_NAME = u'generic'
1106
1107 def __init__(self, downloader=None):
1108 InfoExtractor.__init__(self, downloader)
1109
1110 def report_download_webpage(self, video_id):
1111 """Report webpage download."""
1112 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1113 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1114
1115 def report_extraction(self, video_id):
1116 """Report information extraction."""
1117 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1118
1119 def report_following_redirect(self, new_url):
1120 """Report information extraction."""
1121 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1122
1123 def _test_redirect(self, url):
1124 """Check if it is a redirect, like url shorteners, in case restart chain."""
1125 class HeadRequest(urllib2.Request):
1126 def get_method(self):
1127 return "HEAD"
1128
1129 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1130 """
1131 Subclass the HTTPRedirectHandler to make it use our
1132 HeadRequest also on the redirected URL
1133 """
1134 def redirect_request(self, req, fp, code, msg, headers, newurl):
1135 if code in (301, 302, 303, 307):
303692b5
FV
1136 newurl = newurl.replace(' ', '%20')
1137 newheaders = dict((k,v) for k,v in req.headers.items()
1138 if k.lower() not in ("content-length", "content-type"))
1139 return HeadRequest(newurl,
1140 headers=newheaders,
1141 origin_req_host=req.get_origin_req_host(),
1142 unverifiable=True)
d77c3dfd 1143 else:
303692b5
FV
1144 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1145
d77c3dfd
FV
1146 class HTTPMethodFallback(urllib2.BaseHandler):
1147 """
1148 Fallback to GET if HEAD is not allowed (405 HTTP error)
1149 """
1150 def http_error_405(self, req, fp, code, msg, headers):
1151 fp.read()
1152 fp.close()
1153
1154 newheaders = dict((k,v) for k,v in req.headers.items()
303692b5 1155 if k.lower() not in ("content-length", "content-type"))
d77c3dfd 1156 return self.parent.open(urllib2.Request(req.get_full_url(),
303692b5
FV
1157 headers=newheaders,
1158 origin_req_host=req.get_origin_req_host(),
1159 unverifiable=True))
d77c3dfd
FV
1160
1161 # Build our opener
1162 opener = urllib2.OpenerDirector()
1163 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
303692b5
FV
1164 HTTPMethodFallback, HEADRedirectHandler,
1165 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
d77c3dfd
FV
1166 opener.add_handler(handler())
1167
1168 response = opener.open(HeadRequest(url))
1169 new_url = response.geturl()
1170
1171 if url == new_url: return False
1172
1173 self.report_following_redirect(new_url)
1174 self._downloader.download([new_url])
1175 return True
1176
1177 def _real_extract(self, url):
1178 if self._test_redirect(url): return
d77c3dfd
FV
1179
1180 video_id = url.split('/')[-1]
1181 request = urllib2.Request(url)
1182 try:
1183 self.report_download_webpage(video_id)
1184 webpage = urllib2.urlopen(request).read()
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1187 return
1188 except ValueError, err:
1189 # since this is the last-resort InfoExtractor, if
1190 # this error is thrown, it'll be thrown here
1191 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1192 return
1193
1194 self.report_extraction(video_id)
1195 # Start with something easy: JW Player in SWFObject
1196 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1197 if mobj is None:
1198 # Broaden the search a little bit
1199 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1200 if mobj is None:
1201 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1202 return
1203
1204 # It's possible that one of the regexes
1205 # matched, but returned an empty group:
1206 if mobj.group(1) is None:
1207 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1208 return
1209
1210 video_url = urllib.unquote(mobj.group(1))
1211 video_id = os.path.basename(video_url)
1212
1213 # here's a fun little line of code for you:
1214 video_extension = os.path.splitext(video_id)[1][1:]
1215 video_id = os.path.splitext(video_id)[0]
1216
1217 # it's tempting to parse this further, but you would
1218 # have to take into account all the variations like
1219 # Video Title - Site Name
1220 # Site Name | Video Title
1221 # Video Title - Tagline | Site Name
1222 # and so on and so forth; it's just not practical
1223 mobj = re.search(r'<title>(.*)</title>', webpage)
1224 if mobj is None:
1225 self._downloader.trouble(u'ERROR: unable to extract title')
1226 return
1227 video_title = mobj.group(1).decode('utf-8')
d77c3dfd
FV
1228
1229 # video uploader is domain name
1230 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1231 if mobj is None:
1232 self._downloader.trouble(u'ERROR: unable to extract title')
1233 return
1234 video_uploader = mobj.group(1).decode('utf-8')
1235
58ca755f
FV
1236 return [{
1237 'id': video_id.decode('utf-8'),
1238 'url': video_url.decode('utf-8'),
1239 'uploader': video_uploader,
1240 'upload_date': u'NA',
1241 'title': video_title,
58ca755f
FV
1242 'ext': video_extension.decode('utf-8'),
1243 'format': u'NA',
1244 'player_url': None,
1245 }]
d77c3dfd
FV
1246
1247
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # Accepts "ytsearch:<q>", "ytsearch<N>:<q>" and "ytsearchall:<q>".
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # len('ytsearch') == 8; what remains is '', 'all' or a count.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Plain "ytsearch:" downloads only the first result.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    # Clamp to the API-imposed maximum rather than failing.
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page returns up to 50 ids; keep fetching until we have
        # enough or the service reports fewer total items than requested.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            try:
                data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems may be smaller than n; shrink the target accordingly.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1322
1323
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # Accepts "gvsearch:<q>", "gvsearch<N>:<q>" and "gvsearchall:<q>".
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next" pager link means more result pages exist.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # len('gvsearch') == 8; what remains is '', 'all' or a count.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    # Clamp to the supported maximum rather than failing.
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link on this page: we have everything there is.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1405
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Accepts "yvsearch:<q>", "yvsearch<N>:<q>" and "yvsearchall:<q>".
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # A "Next" link means more result pages exist.
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # len('yvsearch') == 8; what remains is '', 'all' or a count.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    # Clamp to the supported maximum rather than failing.
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        # Yahoo result pages can repeat entries; track what we've seen.
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link on this page: we have everything there is.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1488
1489
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # group(1): playlist type letter ('p', 'a', 'list'), group(2): playlist id,
    # group(3): a single video id when the URL points into a playlist entry.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
    # Presence of the pager's "next" control means more pages exist.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # Anything else (including 'list') is fetched via view_play_list.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Honor --playlist-start / --playlist-end (1-based, -1 = to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1561
1562
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Accepts user-page URLs or the "ytuser:<name>" shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps each uploads query at this many results.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, -1 = to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1644
1645
eeeb4daa
JCGS
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # Accepts user-page URLs or the "bliptvuser:<name>" shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # The episode-list AJAX endpoint returns at most this many entries per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = urllib2.Request(url)

        try:
            # Fetch the user page to resolve the numeric users_id needed by
            # the AJAX endpoint.
            # NOTE(review): if 'data-users-id' is absent, re.search returns
            # None and the .group(1) call raises AttributeError, which this
            # except clause does not catch — verify against live pages.
            page = urllib2.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = urllib2.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = urllib2.urlopen(request).read().decode('utf-8')
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, -1 = to the end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1736
1737
d77c3dfd
FV
1738class DepositFilesIE(InfoExtractor):
1739 """Information extractor for depositfiles.com"""
1740
1741 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1742 IE_NAME = u'DepositFiles'
1743
1744 def __init__(self, downloader=None):
1745 InfoExtractor.__init__(self, downloader)
1746
1747 def report_download_webpage(self, file_id):
1748 """Report webpage download."""
1749 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1750
1751 def report_extraction(self, file_id):
1752 """Report information extraction."""
1753 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1754
1755 def _real_extract(self, url):
d77c3dfd
FV
1756 file_id = url.split('/')[-1]
1757 # Rebuild url in english locale
1758 url = 'http://depositfiles.com/en/files/' + file_id
1759
1760 # Retrieve file webpage with 'Free download' button pressed
1761 free_download_indication = { 'gateway_result' : '1' }
1762 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1763 try:
1764 self.report_download_webpage(file_id)
1765 webpage = urllib2.urlopen(request).read()
1766 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1767 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1768 return
1769
1770 # Search for the real file URL
1771 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1772 if (mobj is None) or (mobj.group(1) is None):
1773 # Try to figure out reason of the error.
1774 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1775 if (mobj is not None) and (mobj.group(1) is not None):
1776 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1777 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1778 else:
1779 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1780 return
1781
1782 file_url = mobj.group(1)
1783 file_extension = os.path.splitext(file_url)[1][1:]
1784
1785 # Search for file title
1786 mobj = re.search(r'<b title="(.*?)">', webpage)
1787 if mobj is None:
1788 self._downloader.trouble(u'ERROR: unable to extract title')
1789 return
1790 file_title = mobj.group(1).decode('utf-8')
1791
58ca755f
FV
1792 return [{
1793 'id': file_id.decode('utf-8'),
1794 'url': file_url.decode('utf-8'),
1795 'uploader': u'NA',
1796 'upload_date': u'NA',
1797 'title': file_title,
58ca755f
FV
1798 'ext': file_extension.decode('utf-8'),
1799 'format': u'NA',
1800 'player_url': None,
1801 }]
d77c3dfd
FV
1802
1803
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Logs in with the user's credentials (from options or .netrc) during
    initialization, then scrapes the video page's inline JavaScript for
    the title, owner, thumbnail and per-quality stream URLs.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Quality identifiers as they appear in the page's JavaScript, best first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with whichever of title/description/owner/thumbnail
        matched, plus a 'video_urls' dict mapping format name -> stream URL.
        """
        # General data: each value is a regex whose first group is the field.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are %-quoted and unicode-escaped inside the JS.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one "<fmt>_src" JS call per available quality.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook before extraction (credentials optional)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials available: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page and return one info dict per selected format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader (mandatory)
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (mandatory)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional -- warn and fall back to empty string)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date (optional; RFC 2822 date converted to YYYYMMDD)
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Unparseable date: keep the u'NA' placeholder.
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # format_limit caps quality at the given format (list is best-first).
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty the branch above is skipped and
        # video_url_list is unbound here, raising NameError -- confirm
        # whether an empty url_map can actually reach this point.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
d77c3dfd
FV
2009
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Appends skin=json to the watch URL to get a JSON description of the
    video; if the server instead answers with the media itself
    (Content-Type video/*), treats it as a direct download.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL pointed directly at the media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a one-element list with the video's info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # Derive id/title/extension from the URL's basename.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle for the download.
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp like '03-28-12 10:15AM' -> 'YYYYMMDD'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Mutates the shared std_headers dict -- presumably so the media
        # request presents an iTunes User-Agent; TODO confirm side effects
        # on other extractors running afterwards.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
d77c3dfd
FV
2096
2097
2098class MyVideoIE(InfoExtractor):
2099 """Information Extractor for myvideo.de."""
2100
2101 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2102 IE_NAME = u'myvideo'
2103
2104 def __init__(self, downloader=None):
2105 InfoExtractor.__init__(self, downloader)
2106
2107 def report_download_webpage(self, video_id):
2108 """Report webpage download."""
2109 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2110
2111 def report_extraction(self, video_id):
2112 """Report information extraction."""
2113 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2114
2115 def _real_extract(self,url):
2116 mobj = re.match(self._VALID_URL, url)
2117 if mobj is None:
2118 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2119 return
2120
2121 video_id = mobj.group(1)
2122
2123 # Get video webpage
2124 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2125 try:
2126 self.report_download_webpage(video_id)
2127 webpage = urllib2.urlopen(request).read()
2128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2129 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2130 return
2131
2132 self.report_extraction(video_id)
2133 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2134 webpage)
2135 if mobj is None:
2136 self._downloader.trouble(u'ERROR: unable to extract media URL')
2137 return
2138 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2139
2140 mobj = re.search('<title>([^<]+)</title>', webpage)
2141 if mobj is None:
2142 self._downloader.trouble(u'ERROR: unable to extract title')
2143 return
2144
2145 video_title = mobj.group(1)
d77c3dfd 2146
58ca755f
FV
2147 return [{
2148 'id': video_id,
2149 'url': video_url,
2150 'uploader': u'NA',
2151 'upload_date': u'NA',
2152 'title': video_title,
58ca755f
FV
2153 'ext': u'flv',
2154 'format': u'NA',
2155 'player_url': None,
2156 }]
d77c3dfd
FV
2157
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepts shortcut pseudo-URLs (:tds, :colbert, ...) for the newest
    episode as well as full-episode page URLs.  Resolves the page's Flash
    URL to an MRSS index, then one media-config document per item.
    """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report that extraction of an episode has started."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report that a media configuration document is being fetched."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report that the show's MRSS index is being fetched."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Return a list of info dicts, one per item in the episode index."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Map shortcut names to the show's newest-episodes landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode means: follow the redirect to the newest one.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # The landing page redirects to a concrete episode URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the embedded Flash movie URL and the mgid-style URI inside it.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        # Resolve the player URL through its redirect.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like 'mgid:...:<show>.com:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Each rendition carries a bitrate attribute and a src element.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            results.append(info)

        return results
d77c3dfd
FV
2289
2290
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the player URL from the page's OpenGraph meta tags, then parses
    the player's JSON-like configuration to locate the media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Return a one-element list with the episode's info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = urllib2.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset advertised in Content-Type, falling
            # back to UTF-8 when none is given.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
            return

        # NOTE(review): these matches are unguarded -- a page missing any
        # of the expected meta tags raises AttributeError on .group(1)
        # instead of producing a clean error; confirm intended behavior.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the config location as a query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
            return

        playlist = config['playlist']
        # The second playlist entry is the one used for the media URL.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
d77c3dfd
FV
2363
2364
2365class CollegeHumorIE(InfoExtractor):
2366 """Information extractor for collegehumor.com"""
2367
2368 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2369 IE_NAME = u'collegehumor'
2370
2371 def report_webpage(self, video_id):
2372 """Report information extraction."""
2373 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2374
2375 def report_extraction(self, video_id):
2376 """Report information extraction."""
2377 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2378
2379 def _real_extract(self, url):
d77c3dfd
FV
2380 mobj = re.match(self._VALID_URL, url)
2381 if mobj is None:
2382 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2383 return
2384 video_id = mobj.group('videoid')
2385
2386 self.report_webpage(video_id)
2387 request = urllib2.Request(url)
2388 try:
2389 webpage = urllib2.urlopen(request).read()
2390 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2391 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2392 return
2393
2394 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2395 if m is None:
2396 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2397 return
2398 internal_video_id = m.group('internalvideoid')
2399
2400 info = {
2401 'id': video_id,
2402 'internal_id': internal_video_id,
2403 }
2404
2405 self.report_extraction(video_id)
2406 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2407 try:
2408 metaXml = urllib2.urlopen(xmlUrl).read()
2409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2410 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2411 return
2412
2413 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2414 try:
2415 videoNode = mdoc.findall('./video')[0]
2416 info['description'] = videoNode.findall('./description')[0].text
2417 info['title'] = videoNode.findall('./caption')[0].text
d77c3dfd
FV
2418 info['url'] = videoNode.findall('./file')[0].text
2419 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2420 info['ext'] = info['url'].rpartition('.')[2]
2421 info['format'] = info['ext']
2422 except IndexError:
2423 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2424 return
2425
58ca755f 2426 return [info]
d77c3dfd
FV
2427
2428
2429class XVideosIE(InfoExtractor):
2430 """Information extractor for xvideos.com"""
2431
2432 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2433 IE_NAME = u'xvideos'
2434
2435 def report_webpage(self, video_id):
2436 """Report information extraction."""
2437 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2438
2439 def report_extraction(self, video_id):
2440 """Report information extraction."""
2441 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2442
2443 def _real_extract(self, url):
d77c3dfd
FV
2444 mobj = re.match(self._VALID_URL, url)
2445 if mobj is None:
2446 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2447 return
2448 video_id = mobj.group(1).decode('utf-8')
2449
2450 self.report_webpage(video_id)
2451
2452 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2453 try:
2454 webpage = urllib2.urlopen(request).read()
2455 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2456 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2457 return
2458
2459 self.report_extraction(video_id)
2460
2461
2462 # Extract video URL
2463 mobj = re.search(r'flv_url=(.+?)&', webpage)
2464 if mobj is None:
2465 self._downloader.trouble(u'ERROR: unable to extract video url')
2466 return
2467 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2468
2469
2470 # Extract title
2471 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2472 if mobj is None:
2473 self._downloader.trouble(u'ERROR: unable to extract video title')
2474 return
2475 video_title = mobj.group(1).decode('utf-8')
2476
2477
2478 # Extract video thumbnail
363a4e11 2479 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
d77c3dfd
FV
2480 if mobj is None:
2481 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2482 return
363a4e11 2483 video_thumbnail = mobj.group(0).decode('utf-8')
d77c3dfd 2484
d77c3dfd
FV
2485 info = {
2486 'id': video_id,
2487 'url': video_url,
2488 'uploader': None,
2489 'upload_date': None,
2490 'title': video_title,
d77c3dfd
FV
2491 'ext': 'flv',
2492 'format': 'flv',
2493 'thumbnail': video_thumbnail,
2494 'description': None,
2495 'player_url': None,
2496 }
2497
58ca755f 2498 return [info]
d77c3dfd
FV
2499
2500
2501class SoundcloudIE(InfoExtractor):
2502 """Information extractor for soundcloud.com
2503 To access the media, the uid of the song and a stream token
2504 must be extracted from the page source and the script must make
2505 a request to media.soundcloud.com/crossdomain.xml. Then
2506 the media can be grabbed by requesting from an url composed
2507 of the stream token and uid
2508 """
2509
2510 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2511 IE_NAME = u'soundcloud'
2512
2513 def __init__(self, downloader=None):
2514 InfoExtractor.__init__(self, downloader)
2515
2516 def report_webpage(self, video_id):
2517 """Report information extraction."""
2518 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2519
2520 def report_extraction(self, video_id):
2521 """Report information extraction."""
2522 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2523
2524 def _real_extract(self, url):
d77c3dfd
FV
2525 mobj = re.match(self._VALID_URL, url)
2526 if mobj is None:
2527 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2528 return
2529
2530 # extract uploader (which is in the url)
2531 uploader = mobj.group(1).decode('utf-8')
2532 # extract simple title (uploader + slug of song title)
2533 slug_title = mobj.group(2).decode('utf-8')
2c288bda 2534 simple_title = uploader + u'-' + slug_title
d77c3dfd
FV
2535
2536 self.report_webpage('%s/%s' % (uploader, slug_title))
2537
2538 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2539 try:
2540 webpage = urllib2.urlopen(request).read()
2541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2542 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2543 return
2544
2545 self.report_extraction('%s/%s' % (uploader, slug_title))
2546
2547 # extract uid and stream token that soundcloud hands out for access
2548 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2549 if mobj:
2550 video_id = mobj.group(1)
2551 stream_token = mobj.group(2)
2552
2553 # extract unsimplified title
2554 mobj = re.search('"title":"(.*?)",', webpage)
2555 if mobj:
2c288bda
FV
2556 title = mobj.group(1).decode('utf-8')
2557 else:
2558 title = simple_title
d77c3dfd
FV
2559
2560 # construct media url (with uid/token)
2561 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2562 mediaURL = mediaURL % (video_id, stream_token)
2563
2564 # description
2565 description = u'No description available'
2566 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2567 if mobj:
2568 description = mobj.group(1)
2569
2570 # upload date
2571 upload_date = None
2572 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2573 if mobj:
2574 try:
2575 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2576 except Exception, e:
6ab92c8b 2577 self._downloader.to_stderr(str(e))
d77c3dfd
FV
2578
2579 # for soundcloud, a request to a cross domain is required for cookies
2580 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2581
58ca755f
FV
2582 return [{
2583 'id': video_id.decode('utf-8'),
2584 'url': mediaURL,
2585 'uploader': uploader.decode('utf-8'),
2586 'upload_date': upload_date,
2c288bda 2587 'title': title,
58ca755f
FV
2588 'ext': u'mp3',
2589 'format': u'NA',
2590 'player_url': None,
2591 'description': description.decode('utf-8')
2592 }]
d77c3dfd
FV
2593
2594
2595class InfoQIE(InfoExtractor):
2596 """Information extractor for infoq.com"""
2597
2598 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2599 IE_NAME = u'infoq'
2600
2601 def report_webpage(self, video_id):
2602 """Report information extraction."""
2603 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2604
2605 def report_extraction(self, video_id):
2606 """Report information extraction."""
2607 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2608
2609 def _real_extract(self, url):
d77c3dfd
FV
2610 mobj = re.match(self._VALID_URL, url)
2611 if mobj is None:
2612 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2613 return
2614
2615 self.report_webpage(url)
2616
2617 request = urllib2.Request(url)
2618 try:
2619 webpage = urllib2.urlopen(request).read()
2620 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2621 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2622 return
2623
2624 self.report_extraction(url)
2625
2626
2627 # Extract video URL
2628 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2629 if mobj is None:
2630 self._downloader.trouble(u'ERROR: unable to extract video url')
2631 return
2632 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2633
2634
2635 # Extract title
2636 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2637 if mobj is None:
2638 self._downloader.trouble(u'ERROR: unable to extract video title')
2639 return
2640 video_title = mobj.group(1).decode('utf-8')
2641
2642 # Extract description
2643 video_description = u'No description available.'
2644 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2645 if mobj is not None:
2646 video_description = mobj.group(1).decode('utf-8')
2647
2648 video_filename = video_url.split('/')[-1]
2649 video_id, extension = video_filename.split('.')
2650
d77c3dfd
FV
2651 info = {
2652 'id': video_id,
2653 'url': video_url,
2654 'uploader': None,
2655 'upload_date': None,
2656 'title': video_title,
d77c3dfd
FV
2657 'ext': extension,
2658 'format': extension, # Extension is always(?) mp4, but seems to be flv
2659 'thumbnail': None,
2660 'description': video_description,
2661 'player_url': None,
2662 }
2663
58ca755f 2664 return [info]
d77c3dfd
FV
2665
2666class MixcloudIE(InfoExtractor):
2667 """Information extractor for www.mixcloud.com"""
2668 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2669 IE_NAME = u'mixcloud'
2670
2671 def __init__(self, downloader=None):
2672 InfoExtractor.__init__(self, downloader)
2673
2674 def report_download_json(self, file_id):
2675 """Report JSON download."""
2676 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2677
2678 def report_extraction(self, file_id):
2679 """Report information extraction."""
2680 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2681
2682 def get_urls(self, jsonData, fmt, bitrate='best'):
2683 """Get urls from 'audio_formats' section in json"""
2684 file_url = None
2685 try:
2686 bitrate_list = jsonData[fmt]
2687 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2688 bitrate = max(bitrate_list) # select highest
2689
2690 url_list = jsonData[fmt][bitrate]
2691 except TypeError: # we have no bitrate info.
2692 url_list = jsonData[fmt]
d77c3dfd
FV
2693 return url_list
2694
2695 def check_urls(self, url_list):
2696 """Returns 1st active url from list"""
2697 for url in url_list:
2698 try:
2699 urllib2.urlopen(url)
2700 return url
2701 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2702 url = None
2703
2704 return None
2705
2706 def _print_formats(self, formats):
2707 print 'Available formats:'
2708 for fmt in formats.keys():
2709 for b in formats[fmt]:
2710 try:
2711 ext = formats[fmt][b][0]
2712 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2713 except TypeError: # we have no bitrate info
2714 ext = formats[fmt][0]
2715 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2716 break
2717
2718 def _real_extract(self, url):
2719 mobj = re.match(self._VALID_URL, url)
2720 if mobj is None:
2721 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2722 return
2723 # extract uploader & filename from url
2724 uploader = mobj.group(1).decode('utf-8')
2725 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2726
2727 # construct API request
2728 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2729 # retrieve .json file with links to files
2730 request = urllib2.Request(file_url)
2731 try:
2732 self.report_download_json(file_url)
2733 jsonData = urllib2.urlopen(request).read()
2734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2735 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2736 return
2737
2738 # parse JSON
2739 json_data = json.loads(jsonData)
2740 player_url = json_data['player_swf_url']
2741 formats = dict(json_data['audio_formats'])
2742
2743 req_format = self._downloader.params.get('format', None)
2744 bitrate = None
2745
2746 if self._downloader.params.get('listformats', None):
2747 self._print_formats(formats)
2748 return
2749
2750 if req_format is None or req_format == 'best':
2751 for format_param in formats.keys():
2752 url_list = self.get_urls(formats, format_param)
2753 # check urls
2754 file_url = self.check_urls(url_list)
2755 if file_url is not None:
2756 break # got it!
2757 else:
2758 if req_format not in formats.keys():
2759 self._downloader.trouble(u'ERROR: format is not available')
2760 return
2761
2762 url_list = self.get_urls(formats, req_format)
2763 file_url = self.check_urls(url_list)
2764 format_param = req_format
2765
58ca755f
FV
2766 return [{
2767 'id': file_id.decode('utf-8'),
2768 'url': file_url.decode('utf-8'),
2769 'uploader': uploader.decode('utf-8'),
2770 'upload_date': u'NA',
2771 'title': json_data['name'],
58ca755f
FV
2772 'ext': file_url.split('.')[-1].decode('utf-8'),
2773 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2774 'thumbnail': json_data['thumbnail_url'],
2775 'description': json_data['description'],
2776 'player_url': player_url.decode('utf-8'),
2777 }]
d77c3dfd
FV
2778
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes, distinguished by the query parameters
    captured by _VALID_URL:
      * VideoPage with course= and video=  -> a single video
      * CoursePage with course= only       -> a playlist of videos
      * the site root / HomePage           -> a playlist of courses
    Playlist pages are resolved by recursively calling self.extract()
    on every referenced page and concatenating the results.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
            }

            self.report_extraction(info['id'])
            # per-video metadata lives in an XML file next to the videos
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            # course title, falling back to the course id
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # collect every VideoPage link (deduplicated, order preserved)
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]
            results = []
            # recursively extract each referenced video page
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']

            # collect every CoursePage link (deduplicated, order preserved)
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            results = []
            # recursively extract each referenced course page
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
d77c3dfd
FV
2890
2891class MTVIE(InfoExtractor):
2892 """Information extractor for MTV.com"""
2893
2894 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2895 IE_NAME = u'mtv'
2896
2897 def report_webpage(self, video_id):
2898 """Report information extraction."""
2899 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2900
2901 def report_extraction(self, video_id):
2902 """Report information extraction."""
2903 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2904
2905 def _real_extract(self, url):
2906 mobj = re.match(self._VALID_URL, url)
2907 if mobj is None:
2908 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2909 return
2910 if not mobj.group('proto'):
2911 url = 'http://' + url
2912 video_id = mobj.group('videoid')
2913 self.report_webpage(video_id)
2914
2915 request = urllib2.Request(url)
2916 try:
2917 webpage = urllib2.urlopen(request).read()
2918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2919 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2920 return
2921
2922 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2923 if mobj is None:
2924 self._downloader.trouble(u'ERROR: unable to extract song name')
2925 return
2926 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2927 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2928 if mobj is None:
2929 self._downloader.trouble(u'ERROR: unable to extract performer')
2930 return
2931 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2932 video_title = performer + ' - ' + song_name
2933
2934 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2935 if mobj is None:
2936 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2937 return
2938 mtvn_uri = mobj.group(1)
2939
2940 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2941 if mobj is None:
2942 self._downloader.trouble(u'ERROR: unable to extract content id')
2943 return
2944 content_id = mobj.group(1)
2945
2946 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2947 self.report_extraction(video_id)
2948 request = urllib2.Request(videogen_url)
2949 try:
2950 metadataXml = urllib2.urlopen(request).read()
2951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2952 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2953 return
2954
2955 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2956 renditions = mdoc.findall('.//rendition')
2957
2958 # For now, always pick the highest quality.
2959 rendition = renditions[-1]
2960
2961 try:
2962 _,_,ext = rendition.attrib['type'].partition('/')
2963 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2964 video_url = rendition.find('./src').text
2965 except KeyError:
2966 self._downloader.trouble('Invalid rendition field.')
2967 return
2968
d77c3dfd
FV
2969 info = {
2970 'id': video_id,
2971 'url': video_url,
2972 'uploader': performer,
2973 'title': video_title,
d77c3dfd
FV
2974 'ext': ext,
2975 'format': format,
2976 }
2977
58ca755f 2978 return [info]