1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import datetime
5import HTMLParser
6import httplib
7import netrc
8import os
9import re
10import socket
11import time
12import urllib
13import urllib2
14import email.utils
921a1455 15import xml.etree.ElementTree
302efc19 16import random
17import math
921a1455 18from urlparse import parse_qs
19
20try:
21 import cStringIO as StringIO
22except ImportError:
23 import StringIO
24
d11d05d0 25from utils import *
26
27
28class InfoExtractor(object):
29 """Information Extractor class.
30
31 Information extractors are the classes that, given a URL, extract
32 information from the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title and simplified
34 title, author and others. The information is stored in a dictionary
35 which is then passed to the FileDownloader. The FileDownloader
36 processes this information possibly downloading the video to the file
37 system, among other possible outcomes. The dictionaries must include
38 the following fields:
39
40 id: Video identifier.
41 url: Final video URL.
42 uploader: Nickname of the video uploader.
43 title: Literal title.
44 ext: Video filename extension.
45 format: Video format.
46 player_url: SWF Player URL (may be None).
47
48 The following fields are optional. Their primary purpose is to allow
49 youtube-dl to serve as the backend for a video search function, such
50 as the one in youtube2mp3. They are only used when their respective
51 forced printing functions are called:
52
53 thumbnail: Full URL to a video thumbnail image.
54 description: One-line video description.
55
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
59 """
60
61 _ready = False
62 _downloader = None
63
64 def __init__(self, downloader=None):
65 """Constructor. Receives an optional downloader."""
66 self._ready = False
67 self.set_downloader(downloader)
68
69 def suitable(self, url):
70 """Receives a URL and returns True if suitable for this IE."""
71 return re.match(self._VALID_URL, url) is not None
72
73 def initialize(self):
74 """Initializes an instance (authentication, etc)."""
75 if not self._ready:
76 self._real_initialize()
77 self._ready = True
78
79 def extract(self, url):
80 """Extracts URL information and returns it in list of dicts."""
81 self.initialize()
82 return self._real_extract(url)
83
84 def set_downloader(self, downloader):
85 """Sets the downloader for this IE."""
86 self._downloader = downloader
87
88 def _real_initialize(self):
89 """Real initialization process. Redefine in subclasses."""
90 pass
91
92 def _real_extract(self, url):
93 """Real extraction process. Redefine in subclasses."""
94 pass
95
96
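# --- Illustrative sketch (not part of the original module) ----------------
# A minimal extractor following the contract described in the InfoExtractor
# docstring above: define _VALID_URL, override _real_extract() and return a
# list of dicts carrying the required fields. The site, URL pattern and all
# values below are hypothetical placeholders; the block is kept as a comment
# so the module's behaviour is unchanged.
#
# class ExampleIE(InfoExtractor):
# 	"""Information extractor for example.com (hypothetical)."""
#
# 	_VALID_URL = r'(?:http://)?(?:www\.)?example\.com/video/([0-9]+)'
# 	IE_NAME = u'example'
#
# 	def _real_extract(self, url):
# 		mobj = re.match(self._VALID_URL, url)
# 		if mobj is None:
# 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# 			return
# 		video_id = mobj.group(1)
# 		# a real extractor would download the page here (urllib2) and parse
# 		# the media URL, title and uploader out of it
# 		return [{
# 			'id': video_id,
# 			'url': u'http://example.com/media/%s.mp4' % video_id,
# 			'uploader': u'NA',
# 			'upload_date': u'NA',
# 			'title': u'Example video %s' % video_id,
# 			'ext': u'mp4',
# 			'format': u'NA',
# 			'player_url': None,
# 		}]
# ---------------------------------------------------------------------------
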
97class YoutubeIE(InfoExtractor):
98 """Information extractor for youtube.com."""
99
100 _VALID_URL = r"""^
101 (
102 (?:https?://)? # http(s):// (optional)
103 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
104 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
b8005afc 105 (?:.*?\#/)? # handle anchor (#/) redirect urls
106 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
107 (?: # the various things that can precede the ID:
108 (?:(?:v|embed|e)/) # v/ or embed/ or e/
109 |(?: # or the v= param in all its forms
110 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
111 (?:\?|\#!?) # the params delimiter ? or # or #!
112 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
113 v=
114 )
115 )? # optional -> youtube.com/xxxx is OK
116 )? # all until now is optional -> you can pass the naked ID
117 ([0-9A-Za-z_-]+) # here it is! the YouTube video ID
118 (?(1).+)? # if we found the ID, everything can follow
119 $"""
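	# Examples of inputs the verbose pattern above accepts (the ID is a made-up
	# placeholder, not a real video):
	#   http://www.youtube.com/watch?v=0123456789A
	#   https://youtu.be/0123456789A
	#   http://www.youtube.com/embed/0123456789A
	#   http://www.youtube.com/watch?feature=related&v=0123456789A
	#   0123456789A                        (the naked video ID)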
120 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
121 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
122 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
123 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
124 _NETRC_MACHINE = 'youtube'
125 # Listed in order of quality
126 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
127 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
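	# The second list simply moves the WebM itags ahead of the MP4 ones of the
	# same quality; it is consulted instead of _available_formats when the
	# prefer_free_formats option is set (see the format selection code below).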
128 _video_extensions = {
129 '13': '3gp',
130 '17': 'mp4',
131 '18': 'mp4',
132 '22': 'mp4',
133 '37': 'mp4',
134 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
135 '43': 'webm',
136 '44': 'webm',
137 '45': 'webm',
3fe294e4 138 '46': 'webm',
139 }
140 _video_dimensions = {
141 '5': '240x400',
142 '6': '???',
143 '13': '???',
144 '17': '144x176',
145 '18': '360x640',
146 '22': '720x1280',
147 '34': '360x640',
148 '35': '480x854',
149 '37': '1080x1920',
150 '38': '3072x4096',
151 '43': '360x640',
152 '44': '480x854',
153 '45': '720x1280',
3fe294e4 154 '46': '1080x1920',
155 }
156 IE_NAME = u'youtube'
157
158 def suitable(self, url):
159 """Receives a URL and returns True if suitable for this IE."""
160 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
161
162 def report_lang(self):
163 """Report attempt to set language."""
164 self._downloader.to_screen(u'[youtube] Setting language')
165
166 def report_login(self):
167 """Report attempt to log in."""
168 self._downloader.to_screen(u'[youtube] Logging in')
169
170 def report_age_confirmation(self):
171 """Report attempt to confirm age."""
172 self._downloader.to_screen(u'[youtube] Confirming age')
173
174 def report_video_webpage_download(self, video_id):
175 """Report attempt to download video webpage."""
176 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
177
178 def report_video_info_webpage_download(self, video_id):
179 """Report attempt to download video info webpage."""
180 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
181
182 def report_video_subtitles_download(self, video_id):
183 """Report attempt to download video subtitles."""
184 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
185
186 def report_information_extraction(self, video_id):
187 """Report attempt to extract video information."""
188 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
189
190 def report_unavailable_format(self, video_id, format):
191 """Report that a format is not available."""
192 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
193
194 def report_rtmp_download(self):
195 """Indicate the download will use the RTMP protocol."""
196 self._downloader.to_screen(u'[youtube] RTMP download detected')
197
198 def _closed_captions_xml_to_srt(self, xml_string):
199 srt = ''
200 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
201 # TODO parse xml instead of regex
202 for n, (start, dur_tag, dur, caption) in enumerate(texts):
203 if not dur: dur = '4'
204 start = float(start)
205 end = start + float(dur)
206 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
207 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
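			# e.g. a caption with start="72.5" and dur="3" (illustrative numbers) becomes
			#   start -> "00:01:12,500"   (72.5/3600 = 0 h, 72.5/60 % 60 = 1 min, 72.5 % 60 = 12 s, 72.5 % 1 * 1000 = 500 ms)
			#   end   -> "00:01:15,500"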
9e6dd238 208 caption = unescapeHTML(caption)
6ab92c8b 209 caption = unescapeHTML(caption) # double cycle, intentional
54041793 210 srt += str(n+1) + '\n'
211 srt += start + ' --> ' + end + '\n'
212 srt += caption + '\n\n'
213 return srt
214
215 def _print_formats(self, formats):
51937c08 216 print('Available formats:')
d77c3dfd 217 for x in formats:
51937c08 218 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
219
220 def _real_initialize(self):
221 if self._downloader is None:
222 return
223
224 username = None
225 password = None
226 downloader_params = self._downloader.params
227
228 # Attempt to use provided username and password or .netrc data
229 if downloader_params.get('username', None) is not None:
230 username = downloader_params['username']
231 password = downloader_params['password']
232 elif downloader_params.get('usenetrc', False):
233 try:
234 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
235 if info is not None:
236 username = info[0]
237 password = info[2]
238 else:
239 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
240 except (IOError, netrc.NetrcParseError), err:
92b91c18 241 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
242 return
243
244 # Set language
245 request = urllib2.Request(self._LANG_URL)
246 try:
247 self.report_lang()
248 urllib2.urlopen(request).read()
249 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 250 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
251 return
252
253 # No authentication to be performed
254 if username is None:
255 return
256
257 # Log in
258 login_form = {
259 'current_form': 'loginForm',
260 'next': '/',
261 'action_login': 'Log In',
262 'username': username,
263 'password': password,
264 }
265 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
266 try:
267 self.report_login()
268 login_results = urllib2.urlopen(request).read()
269 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
270 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
271 return
272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 273 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
274 return
275
276 # Confirm age
277 age_form = {
278 'next_url': '/',
279 'action_confirm': 'Confirm',
280 }
281 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
282 try:
283 self.report_age_confirmation()
284 age_results = urllib2.urlopen(request).read()
285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 286 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
287 return
288
289 def _real_extract(self, url):
290 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
291 mobj = re.search(self._NEXT_URL_RE, url)
292 if mobj:
293 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
294
295 # Extract video id from URL
202e76cf 296 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
297 if mobj is None:
298 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
299 return
300 video_id = mobj.group(2)
301
302 # Get video webpage
303 self.report_video_webpage_download(video_id)
304 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
305 try:
306 video_webpage = urllib2.urlopen(request).read()
307 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 308 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
309 return
310
311 # Attempt to extract SWF player URL
312 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
313 if mobj is not None:
314 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
315 else:
316 player_url = None
317
318 # Get video info
319 self.report_video_info_webpage_download(video_id)
320 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
321 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
322 % (video_id, el_type))
323 request = urllib2.Request(video_info_url)
324 try:
325 video_info_webpage = urllib2.urlopen(request).read()
326 video_info = parse_qs(video_info_webpage)
327 if 'token' in video_info:
328 break
329 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 330 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
331 return
332 if 'token' not in video_info:
333 if 'reason' in video_info:
334 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
335 else:
336 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
337 return
338
339 # Check for "rental" videos
340 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
341 self._downloader.trouble(u'ERROR: "rental" videos not supported')
342 return
343
344 # Start extracting information
345 self.report_information_extraction(video_id)
346
347 # uploader
348 if 'author' not in video_info:
349 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
350 return
351 video_uploader = urllib.unquote_plus(video_info['author'][0])
352
353 # title
354 if 'title' not in video_info:
355 self._downloader.trouble(u'ERROR: unable to extract video title')
356 return
357 video_title = urllib.unquote_plus(video_info['title'][0])
358 video_title = video_title.decode('utf-8')
359
360 # thumbnail image
361 if 'thumbnail_url' not in video_info:
362 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
363 video_thumbnail = ''
364 else: # don't panic if we can't find it
365 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
366
367 # upload date
368 upload_date = u'NA'
369 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
370 if mobj is not None:
371 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
372 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
373 for expression in format_expressions:
374 try:
375 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
376 except:
377 pass
378
379 # description
380 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
381 if video_description: video_description = clean_html(video_description)
9e6dd238 382 else: video_description = ''
383
384 # closed captions
385 video_subtitles = None
386 if self._downloader.params.get('writesubtitles', False):
d77c3dfd 387 try:
388 self.report_video_subtitles_download(video_id)
389 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
390 try:
391 srt_list = urllib2.urlopen(request).read()
392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 393 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
394 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
395 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
396 if not srt_lang_list:
397 raise Trouble(u'WARNING: video has no closed captions')
398 if self._downloader.params.get('subtitleslang', False):
399 srt_lang = self._downloader.params.get('subtitleslang')
400 elif 'en' in srt_lang_list:
401 srt_lang = 'en'
d77c3dfd 402 else:
dee5d769 403 srt_lang = srt_lang_list.keys()[0]
404 if not srt_lang in srt_lang_list:
405 raise Trouble(u'WARNING: no closed captions found in the specified language')
dee5d769 406 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
407 try:
408 srt_xml = urllib2.urlopen(request).read()
409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 410 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
411 if not srt_xml:
412 raise Trouble(u'WARNING: unable to download video subtitles')
413 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
414 except Trouble as trouble:
415 self._downloader.trouble(trouble[0])
d77c3dfd 416
417 if 'length_seconds' not in video_info:
418 self._downloader.trouble(u'WARNING: unable to extract video duration')
419 video_duration = ''
420 else:
421 video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
422
423 # token
424 video_token = urllib.unquote_plus(video_info['token'][0])
425
426 # Decide which formats to download
427 req_format = self._downloader.params.get('format', None)
428
429 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
430 self.report_rtmp_download()
431 video_url_list = [(None, video_info['conn'][0])]
432 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
433 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
434 url_data = [parse_qs(uds) for uds in url_data_strs]
435 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
9ca66706 436 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
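			# A (hypothetical) stream map entry such as
			#   'itag=22&url=http%3A%2F%2Fexample.googlevideo.com%2Fvideoplayback%3F...&sig=ABCDEF'
			# is percent-decoded by parse_qs, so the line above yields roughly
			#   url_map['22'] == 'http://example.googlevideo.com/videoplayback?...&signature=ABCDEF'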
437
438 format_limit = self._downloader.params.get('format_limit', None)
439 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
440 if format_limit is not None and format_limit in available_formats:
441 format_list = available_formats[available_formats.index(format_limit):]
442 else:
443 format_list = available_formats
444 existing_formats = [x for x in format_list if x in url_map]
445 if len(existing_formats) == 0:
446 self._downloader.trouble(u'ERROR: no known formats available for video')
447 return
448 if self._downloader.params.get('listformats', None):
449 self._print_formats(existing_formats)
450 return
451 if req_format is None or req_format == 'best':
452 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
453 elif req_format == 'worst':
454 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
455 elif req_format in ('-1', 'all'):
456 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
457 else:
458 # Specific formats. We pick the first in a slash-delimited sequence.
459 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
460 req_formats = req_format.split('/')
461 video_url_list = None
462 for rf in req_formats:
463 if rf in url_map:
464 video_url_list = [(rf, url_map[rf])]
465 break
466 if video_url_list is None:
467 self._downloader.trouble(u'ERROR: requested format not available')
468 return
469 else:
470 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
471 return
472
58ca755f 473 results = []
d77c3dfd 474 for format_param, video_real_url in video_url_list:
475 # Extension
476 video_extension = self._video_extensions.get(format_param, 'flv')
477
478 results.append({
479 'id': video_id.decode('utf-8'),
480 'url': video_real_url.decode('utf-8'),
481 'uploader': video_uploader.decode('utf-8'),
482 'upload_date': upload_date,
483 'title': video_title,
484 'ext': video_extension.decode('utf-8'),
485 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
486 'thumbnail': video_thumbnail.decode('utf-8'),
487 'description': video_description,
488 'player_url': player_url,
489 'subtitles': video_subtitles,
490 'duration': video_duration
491 })
492 return results
493
494
495class MetacafeIE(InfoExtractor):
496 """Information Extractor for metacafe.com."""
497
498 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
499 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
500 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
501 IE_NAME = u'metacafe'
502
58ca755f 503 def __init__(self, downloader=None):
d77c3dfd 504 InfoExtractor.__init__(self, downloader)
505
506 def report_disclaimer(self):
507 """Report disclaimer retrieval."""
508 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
509
510 def report_age_confirmation(self):
511 """Report attempt to confirm age."""
512 self._downloader.to_screen(u'[metacafe] Confirming age')
513
514 def report_download_webpage(self, video_id):
515 """Report webpage download."""
516 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
517
518 def report_extraction(self, video_id):
519 """Report information extraction."""
520 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
521
522 def _real_initialize(self):
523 # Retrieve disclaimer
524 request = urllib2.Request(self._DISCLAIMER)
525 try:
526 self.report_disclaimer()
527 disclaimer = urllib2.urlopen(request).read()
528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 529 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
530 return
531
532 # Confirm age
533 disclaimer_form = {
534 'filters': '0',
535 'submit': "Continue - I'm over 18",
536 }
537 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
538 try:
539 self.report_age_confirmation()
540 disclaimer = urllib2.urlopen(request).read()
541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 542 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
543 return
544
545 def _real_extract(self, url):
546 # Extract id and simplified title from URL
547 mobj = re.match(self._VALID_URL, url)
548 if mobj is None:
549 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
550 return
551
552 video_id = mobj.group(1)
553
554 # Check if video comes from YouTube
555 mobj2 = re.match(r'^yt-(.*)$', video_id)
556 if mobj2 is not None:
58ca755f 557 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
558 return
559
560 # Retrieve video webpage to extract further information
561 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
562 try:
563 self.report_download_webpage(video_id)
564 webpage = urllib2.urlopen(request).read()
565 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 566 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
567 return
568
569 # Extract URL, uploader and title from webpage
570 self.report_extraction(video_id)
571 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
572 if mobj is not None:
573 mediaURL = urllib.unquote(mobj.group(1))
574 video_extension = mediaURL[-3:]
575
576 # Extract gdaKey if available
577 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
578 if mobj is None:
579 video_url = mediaURL
580 else:
581 gdaKey = mobj.group(1)
582 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
583 else:
584 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
585 if mobj is None:
586 self._downloader.trouble(u'ERROR: unable to extract media URL')
587 return
588 vardict = parse_qs(mobj.group(1))
589 if 'mediaData' not in vardict:
590 self._downloader.trouble(u'ERROR: unable to extract media URL')
591 return
592 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
593 if mobj is None:
594 self._downloader.trouble(u'ERROR: unable to extract media URL')
595 return
596 mediaURL = mobj.group(1).replace('\\/', '/')
597 video_extension = mediaURL[-3:]
598 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
599
600 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
601 if mobj is None:
602 self._downloader.trouble(u'ERROR: unable to extract title')
603 return
604 video_title = mobj.group(1).decode('utf-8')
d77c3dfd 605
bf95333e 606 mobj = re.search(r'submitter=(.*?);', webpage)
607 if mobj is None:
608 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
609 return
610 video_uploader = mobj.group(1)
611
612 return [{
613 'id': video_id.decode('utf-8'),
614 'url': video_url.decode('utf-8'),
615 'uploader': video_uploader.decode('utf-8'),
616 'upload_date': u'NA',
617 'title': video_title,
618 'ext': video_extension.decode('utf-8'),
619 'format': u'NA',
620 'player_url': None,
621 }]
622
623
624class DailymotionIE(InfoExtractor):
625 """Information Extractor for Dailymotion"""
626
63ec7b74 627 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
628 IE_NAME = u'dailymotion'
629
630 def __init__(self, downloader=None):
631 InfoExtractor.__init__(self, downloader)
632
633 def report_download_webpage(self, video_id):
634 """Report webpage download."""
635 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
636
637 def report_extraction(self, video_id):
638 """Report information extraction."""
639 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
640
641 def _real_extract(self, url):
642 # Extract id and simplified title from URL
643 mobj = re.match(self._VALID_URL, url)
644 if mobj is None:
645 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
646 return
647
13e69f54 648 video_id = mobj.group(1).split('_')[0].split('?')[0]
d77c3dfd 649
349e2e3e 650 video_extension = 'mp4'
651
652 # Retrieve video webpage to extract further information
653 request = urllib2.Request(url)
654 request.add_header('Cookie', 'family_filter=off')
655 try:
656 self.report_download_webpage(video_id)
657 webpage = urllib2.urlopen(request).read()
658 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 659 self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
660 return
661
662 # Extract URL, uploader and title from webpage
663 self.report_extraction(video_id)
349e2e3e 664 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
665 if mobj is None:
666 self._downloader.trouble(u'ERROR: unable to extract media URL')
667 return
349e2e3e 668 flashvars = urllib.unquote(mobj.group(1))
669
670 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
671 if key in flashvars:
672 max_quality = key
673 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
674 break
675 else:
676 self._downloader.trouble(u'ERROR: unable to extract video URL')
677 return
678
3c4d6c9e 679 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
7b6d7001 680 if mobj is None:
0bfd0b59 681 self._downloader.trouble(u'ERROR: unable to extract video URL')
d77c3dfd 682 return
0bfd0b59 683
7b6d7001 684 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
d77c3dfd 685
3c4d6c9e 686 # TODO: support choosing qualities
687
688 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
689 if mobj is None:
690 self._downloader.trouble(u'ERROR: unable to extract title')
691 return
692 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
d77c3dfd 693
0bfd0b59 694 video_uploader = u'NA'
fe4d68e1 695 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
d77c3dfd 696 if mobj is None:
697 # looking for the official user
698 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
699 if mobj_official is None:
700 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
701 else:
702 video_uploader = mobj_official.group(1)
703 else:
704 video_uploader = mobj.group(1)
d77c3dfd 705
706 video_upload_date = u'NA'
707 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
708 if mobj is not None:
709 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
710
711 return [{
712 'id': video_id.decode('utf-8'),
3c4d6c9e 713 'url': video_url.decode('utf-8'),
58ca755f 714 'uploader': video_uploader.decode('utf-8'),
413575f7 715 'upload_date': video_upload_date,
58ca755f 716 'title': video_title,
717 'ext': video_extension.decode('utf-8'),
718 'format': u'NA',
719 'player_url': None,
720 }]
721
722
723class GoogleIE(InfoExtractor):
724 """Information extractor for video.google.com."""
725
726 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
727 IE_NAME = u'video.google'
728
729 def __init__(self, downloader=None):
730 InfoExtractor.__init__(self, downloader)
731
732 def report_download_webpage(self, video_id):
733 """Report webpage download."""
734 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
735
736 def report_extraction(self, video_id):
737 """Report information extraction."""
738 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
739
740 def _real_extract(self, url):
741 # Extract id from URL
742 mobj = re.match(self._VALID_URL, url)
743 if mobj is None:
744 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
745 return
746
747 video_id = mobj.group(1)
748
749 video_extension = 'mp4'
750
751 # Retrieve video webpage to extract further information
752 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
753 try:
754 self.report_download_webpage(video_id)
755 webpage = urllib2.urlopen(request).read()
756 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 757 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
758 return
759
760 # Extract URL, uploader, and title from webpage
761 self.report_extraction(video_id)
762 mobj = re.search(r"download_url:'([^']+)'", webpage)
763 if mobj is None:
764 video_extension = 'flv'
765 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
766 if mobj is None:
767 self._downloader.trouble(u'ERROR: unable to extract media URL')
768 return
769 mediaURL = urllib.unquote(mobj.group(1))
770 mediaURL = mediaURL.replace('\\x3d', '\x3d')
771 mediaURL = mediaURL.replace('\\x26', '\x26')
772
773 video_url = mediaURL
774
775 mobj = re.search(r'<title>(.*)</title>', webpage)
776 if mobj is None:
777 self._downloader.trouble(u'ERROR: unable to extract title')
778 return
779 video_title = mobj.group(1).decode('utf-8')
780
781 # Extract video description
782 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
783 if mobj is None:
784 self._downloader.trouble(u'ERROR: unable to extract video description')
785 return
786 video_description = mobj.group(1).decode('utf-8')
787 if not video_description:
788 video_description = 'No description available.'
789
790 # Extract video thumbnail
791 if self._downloader.params.get('forcethumbnail', False):
792 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
793 try:
794 webpage = urllib2.urlopen(request).read()
795 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 796 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
797 return
798 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
799 if mobj is None:
800 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
801 return
802 video_thumbnail = mobj.group(1)
803 else: # we need something to pass to process_info
804 video_thumbnail = ''
805
806 return [{
807 'id': video_id.decode('utf-8'),
808 'url': video_url.decode('utf-8'),
809 'uploader': u'NA',
810 'upload_date': u'NA',
811 'title': video_title,
812 'ext': video_extension.decode('utf-8'),
813 'format': u'NA',
814 'player_url': None,
815 }]
816
817
818class PhotobucketIE(InfoExtractor):
819 """Information extractor for photobucket.com."""
820
821 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
822 IE_NAME = u'photobucket'
823
824 def __init__(self, downloader=None):
825 InfoExtractor.__init__(self, downloader)
826
827 def report_download_webpage(self, video_id):
828 """Report webpage download."""
829 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
830
831 def report_extraction(self, video_id):
832 """Report information extraction."""
833 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
834
835 def _real_extract(self, url):
836 # Extract id from URL
837 mobj = re.match(self._VALID_URL, url)
838 if mobj is None:
839 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
840 return
841
842 video_id = mobj.group(1)
843
844 video_extension = 'flv'
845
846 # Retrieve video webpage to extract further information
847 request = urllib2.Request(url)
848 try:
849 self.report_download_webpage(video_id)
850 webpage = urllib2.urlopen(request).read()
851 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 852 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
853 return
854
855 # Extract URL, uploader, and title from webpage
856 self.report_extraction(video_id)
857 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
858 if mobj is None:
859 self._downloader.trouble(u'ERROR: unable to extract media URL')
860 return
861 mediaURL = urllib.unquote(mobj.group(1))
862
863 video_url = mediaURL
864
865 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
866 if mobj is None:
867 self._downloader.trouble(u'ERROR: unable to extract title')
868 return
869 video_title = mobj.group(1).decode('utf-8')
870
871 video_uploader = mobj.group(2).decode('utf-8')
872
873 return [{
874 'id': video_id.decode('utf-8'),
875 'url': video_url.decode('utf-8'),
876 'uploader': video_uploader,
877 'upload_date': u'NA',
878 'title': video_title,
879 'ext': video_extension.decode('utf-8'),
880 'format': u'NA',
881 'player_url': None,
882 }]
883
884
885class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
887
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
893
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
896
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
900
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
904
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
908 if mobj is None:
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
910 return
911
912 video_id = mobj.group(2)
913 video_extension = 'flv'
914
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = urllib2.Request(url)
919 try:
920 webpage = urllib2.urlopen(request).read()
921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
923 return
924
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
926 if mobj is None:
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
928 return
929 yahoo_id = mobj.group(1)
930
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
932 if mobj is None:
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
934 return
935 yahoo_vid = mobj.group(1)
936
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
938 return self._real_extract(url, new_video=False)
939
940 # Retrieve video webpage to extract further information
941 request = urllib2.Request(url)
942 try:
943 self.report_download_webpage(video_id)
944 webpage = urllib2.urlopen(request).read()
945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
947 return
948
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
952 if mobj is None:
953 self._downloader.trouble(u'ERROR: unable to extract video title')
954 return
955 video_title = mobj.group(1).decode('utf-8')
956
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
958 if mobj is None:
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
960 return
961 video_uploader = mobj.group(2).decode('utf-8') # group(1) only holds 'people' or 'profile'
962
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
965 if mobj is None:
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
967 return
968 video_thumbnail = mobj.group(1).decode('utf-8')
969
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
972 if mobj is None:
973 self._downloader.trouble(u'ERROR: unable to extract video description')
974 return
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
978
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
981 if mobj is None:
982 self._downloader.trouble(u'ERROR: unable to extract video height')
983 return
984 yv_video_height = mobj.group(1)
985
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
987 if mobj is None:
988 self._downloader.trouble(u'ERROR: unable to extract video width')
989 return
990 yv_video_width = mobj.group(1)
991
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000 try:
1001 self.report_download_webpage(video_id)
1002 webpage = urllib2.urlopen(request).read()
1003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1005 return
1006
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009 if mobj is None:
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011 return
1012 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
9e6dd238 1013 video_url = unescapeHTML(video_url)
d77c3dfd 1014
1015 return [{
1016 'id': video_id.decode('utf-8'),
1017 'url': video_url,
1018 'uploader': video_uploader,
1019 'upload_date': u'NA',
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
1025 'player_url': None,
1026 }]
1027
1028
1029class VimeoIE(InfoExtractor):
1030 """Information extractor for vimeo.com."""
1031
1032 # _VALID_URL matches Vimeo URLs
297d7fd9 1033 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1034 IE_NAME = u'vimeo'
1035
1036 def __init__(self, downloader=None):
1037 InfoExtractor.__init__(self, downloader)
1038
1039 def report_download_webpage(self, video_id):
1040 """Report webpage download."""
1041 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1042
1043 def report_extraction(self, video_id):
1044 """Report information extraction."""
1045 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1046
1047 def _real_extract(self, url, new_video=True):
1048 # Extract ID from URL
1049 mobj = re.match(self._VALID_URL, url)
1050 if mobj is None:
1051 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052 return
1053
1054 video_id = mobj.group(1)
1055
1056 # Retrieve video webpage to extract further information
1057 request = urllib2.Request(url, None, std_headers)
1058 try:
1059 self.report_download_webpage(video_id)
1060 webpage = urllib2.urlopen(request).read()
1061 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1062 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1063 return
1064
1065 # Now we begin extracting as much information as we can from what we
1066 # retrieved. First we extract the information common to all extractors,
1067 # and later we extract those that are Vimeo specific.
1068 self.report_extraction(video_id)
1069
1070 # Extract the config JSON
1071 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1072 try:
1073 config = json.loads(config)
1074 except:
1075 self._downloader.trouble(u'ERROR: unable to extract info section')
1076 return
1077
1078 # Extract title
1079 video_title = config["video"]["title"]
1080
1081 # Extract uploader
1082 video_uploader = config["video"]["owner"]["name"]
1083
1084 # Extract video thumbnail
1085 video_thumbnail = config["video"]["thumbnail"]
1086
1087 # Extract video description
1088 video_description = get_element_by_id("description", webpage.decode('utf8'))
1089 if video_description: video_description = clean_html(video_description)
9e6dd238 1090 else: video_description = ''
1091
1092 # Extract upload date
1093 video_upload_date = u'NA'
1094 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1095 if mobj is not None:
1096 video_upload_date = mobj.group(1)
1097
1098 # Vimeo specific: extract request signature and timestamp
1099 sig = config['request']['signature']
1100 timestamp = config['request']['timestamp']
1101
1102 # Vimeo specific: extract video codec and quality information
74033a66 1103 # First consider quality, then codecs, then take everything
1104 # TODO bind to format param
1105 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1106 files = { 'hd': [], 'sd': [], 'other': []}
1107 for codec_name, codec_extension in codecs:
1108 if codec_name in config["video"]["files"]:
1109 if 'hd' in config["video"]["files"][codec_name]:
1110 files['hd'].append((codec_name, codec_extension, 'hd'))
1111 elif 'sd' in config["video"]["files"][codec_name]:
1112 files['sd'].append((codec_name, codec_extension, 'sd'))
1113 else:
1114 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1115
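		# Preference is quality first, then codec order from the list above: e.g. a
		# (hypothetical) clip offering only VP8 in HD and H264 in SD is fetched as
		# the VP8/HD file by the loop below.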
1116 for quality in ('hd', 'sd', 'other'):
1117 if len(files[quality]) > 0:
1118 video_quality = files[quality][0][2]
1119 video_codec = files[quality][0][0]
1120 video_extension = files[quality][0][1]
1121 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1122 break
1123 else:
1124 self._downloader.trouble(u'ERROR: no known codec found')
1125 return
1126
1127 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
74033a66 1128 %(video_id, sig, timestamp, video_quality, video_codec.upper())
d77c3dfd 1129
1130 return [{
1131 'id': video_id,
1132 'url': video_url,
1133 'uploader': video_uploader,
1134 'upload_date': video_upload_date,
1135 'title': video_title,
1136 'ext': video_extension,
1137 'thumbnail': video_thumbnail,
1138 'description': video_description,
1139 'player_url': None,
1140 }]
1141
1142
1143class GenericIE(InfoExtractor):
1144 """Generic last-resort information extractor."""
1145
1146 _VALID_URL = r'.*'
1147 IE_NAME = u'generic'
1148
1149 def __init__(self, downloader=None):
1150 InfoExtractor.__init__(self, downloader)
1151
1152 def report_download_webpage(self, video_id):
1153 """Report webpage download."""
1154 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1155 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1156
1157 def report_extraction(self, video_id):
1158 """Report information extraction."""
1159 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1160
1161 def report_following_redirect(self, new_url):
1162 """Report information extraction."""
1163 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1164
1165 def _test_redirect(self, url):
1166 """Check if the URL is a redirect (e.g. a URL shortener) and, if so, restart the chain with the real URL."""
1167 class HeadRequest(urllib2.Request):
1168 def get_method(self):
1169 return "HEAD"
1170
1171 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1172 """
1173 Subclass the HTTPRedirectHandler to make it use our
1174 HeadRequest also on the redirected URL
1175 """
1176 def redirect_request(self, req, fp, code, msg, headers, newurl):
1177 if code in (301, 302, 303, 307):
1178 newurl = newurl.replace(' ', '%20')
1179 newheaders = dict((k,v) for k,v in req.headers.items()
1180 if k.lower() not in ("content-length", "content-type"))
1181 return HeadRequest(newurl,
1182 headers=newheaders,
1183 origin_req_host=req.get_origin_req_host(),
1184 unverifiable=True)
d77c3dfd 1185 else:
1186 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1187
1188 class HTTPMethodFallback(urllib2.BaseHandler):
1189 """
1190 Fallback to GET if HEAD is not allowed (405 HTTP error)
1191 """
1192 def http_error_405(self, req, fp, code, msg, headers):
1193 fp.read()
1194 fp.close()
1195
1196 newheaders = dict((k,v) for k,v in req.headers.items()
303692b5 1197 if k.lower() not in ("content-length", "content-type"))
d77c3dfd 1198 return self.parent.open(urllib2.Request(req.get_full_url(),
1199 headers=newheaders,
1200 origin_req_host=req.get_origin_req_host(),
1201 unverifiable=True))
1202
1203 # Build our opener
1204 opener = urllib2.OpenerDirector()
1205 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1206 HTTPMethodFallback, HEADRedirectHandler,
1207 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1208 opener.add_handler(handler())
1209
1210 response = opener.open(HeadRequest(url))
1211 new_url = response.geturl()
1212
1213 if url == new_url: return False
1214
1215 self.report_following_redirect(new_url)
1216 self._downloader.download([new_url])
1217 return True
1218
1219 def _real_extract(self, url):
1220 if self._test_redirect(url): return
1221
1222 video_id = url.split('/')[-1]
1223 request = urllib2.Request(url)
1224 try:
1225 self.report_download_webpage(video_id)
1226 webpage = urllib2.urlopen(request).read()
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1228 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1229 return
1230 except ValueError, err:
1231 # since this is the last-resort InfoExtractor, if
1232 # this error is thrown, it'll be thrown here
1233 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1234 return
1235
1236 self.report_extraction(video_id)
1237 # Start with something easy: JW Player in SWFObject
1238 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1239 if mobj is None:
1240 # Broaden the search a little bit
1241 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1242 if mobj is None:
1243 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1244 return
1245
1246 # It's possible that one of the regexes
1247 # matched, but returned an empty group:
1248 if mobj.group(1) is None:
1249 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1250 return
1251
1252 video_url = urllib.unquote(mobj.group(1))
1253 video_id = os.path.basename(video_url)
1254
1255 # here's a fun little line of code for you:
1256 video_extension = os.path.splitext(video_id)[1][1:]
1257 video_id = os.path.splitext(video_id)[0]
1258
1259 # it's tempting to parse this further, but you would
1260 # have to take into account all the variations like
1261 # Video Title - Site Name
1262 # Site Name | Video Title
1263 # Video Title - Tagline | Site Name
1264 # and so on and so forth; it's just not practical
1265 mobj = re.search(r'<title>(.*)</title>', webpage)
1266 if mobj is None:
1267 self._downloader.trouble(u'ERROR: unable to extract title')
1268 return
1269 video_title = mobj.group(1).decode('utf-8')
1270
1271 # video uploader is domain name
1272 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1273 if mobj is None:
1274 self._downloader.trouble(u'ERROR: unable to extract uploader (domain name)')
1275 return
1276 video_uploader = mobj.group(1).decode('utf-8')
1277
1278 return [{
1279 'id': video_id.decode('utf-8'),
1280 'url': video_url.decode('utf-8'),
1281 'uploader': video_uploader,
1282 'upload_date': u'NA',
1283 'title': video_title,
1284 'ext': video_extension.decode('utf-8'),
1285 'format': u'NA',
1286 'player_url': None,
1287 }]
1288
1289
1290class YoutubeSearchIE(InfoExtractor):
1291 """Information Extractor for YouTube search queries."""
1292 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1293 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1294 _max_youtube_results = 1000
1295 IE_NAME = u'youtube:search'
1296
58ca755f 1297 def __init__(self, downloader=None):
d77c3dfd 1298 InfoExtractor.__init__(self, downloader)
1299
1300 def report_download_page(self, query, pagenum):
d4e16d3e 1301 """Report attempt to download search page with given number."""
1302 query = query.decode(preferredencoding())
1303 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1304
1305 def _real_extract(self, query):
1306 mobj = re.match(self._VALID_URL, query)
1307 if mobj is None:
1308 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1309 return
1310
1311 prefix, query = query.split(':')
1312 prefix = prefix[8:]
1313 query = query.encode('utf-8')
1314 if prefix == '':
1315 self._download_n_results(query, 1)
1316 return
1317 elif prefix == 'all':
1318 self._download_n_results(query, self._max_youtube_results)
1319 return
1320 else:
1321 try:
1322 n = long(prefix)
1323 if n <= 0:
1324 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1325 return
1326 elif n > self._max_youtube_results:
1327 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1328 n = self._max_youtube_results
1329 self._download_n_results(query, n)
1330 return
1331 except ValueError: # parsing prefix as integer fails
1332 self._download_n_results(query, 1)
1333 return
1334
1335 def _download_n_results(self, query, n):
1336 """Downloads a specified number of results for a query"""
1337
1338 video_ids = []
1339 pagenum = 0
1340 limit = n
1341
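		# Each API page returns at most 50 results (start-index is 1-based), so e.g.
		# a request for 120 videos fetches pages 1-3 (up to 150 ids) and the list is
		# trimmed back to 120 below (illustrative numbers).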
1342 while (50 * pagenum) < limit:
1343 self.report_download_page(query, pagenum+1)
1344 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1345 request = urllib2.Request(result_url)
1346 try:
1347 data = urllib2.urlopen(request).read()
1348 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1349 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1350 return
1351 api_response = json.loads(data)['data']
1352
1353 new_ids = list(video['id'] for video in api_response['items'])
1354 video_ids += new_ids
1355
1356 limit = min(n, api_response['totalItems'])
1357 pagenum += 1
1358
1359 if len(video_ids) > n:
1360 video_ids = video_ids[:n]
1361 for id in video_ids:
58ca755f 1362 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1363 return
1364
1365
1366class GoogleSearchIE(InfoExtractor):
1367 """Information Extractor for Google Video search queries."""
1368 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1369 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1370 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1371 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1372 _max_google_results = 1000
1373 IE_NAME = u'video.google:search'
1374
58ca755f 1375 def __init__(self, downloader=None):
d77c3dfd 1376 InfoExtractor.__init__(self, downloader)
1377
1378 def report_download_page(self, query, pagenum):
1379 """Report attempt to download search results page with given number."""
1380 query = query.decode(preferredencoding())
1381 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1382
1383 def _real_extract(self, query):
1384 mobj = re.match(self._VALID_URL, query)
1385 if mobj is None:
1386 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1387 return
1388
1389 prefix, query = query.split(':')
1390 prefix = prefix[8:]
1391 query = query.encode('utf-8')
1392 if prefix == '':
1393 self._download_n_results(query, 1)
1394 return
1395 elif prefix == 'all':
1396 self._download_n_results(query, self._max_google_results)
1397 return
1398 else:
1399 try:
1400 n = long(prefix)
1401 if n <= 0:
1402 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1403 return
1404 elif n > self._max_google_results:
1405 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1406 n = self._max_google_results
1407 self._download_n_results(query, n)
1408 return
1409 except ValueError: # parsing prefix as integer fails
1410 self._download_n_results(query, 1)
1411 return
1412
1413 def _download_n_results(self, query, n):
1414 """Downloads a specified number of results for a query"""
1415
1416 video_ids = []
1417 pagenum = 0
1418
1419 while True:
1420 self.report_download_page(query, pagenum)
1421 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1422 request = urllib2.Request(result_url)
1423 try:
1424 page = urllib2.urlopen(request).read()
1425 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1426 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1427 return
1428
1429 # Extract video identifiers
1430 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1431 video_id = mobj.group(1)
1432 if video_id not in video_ids:
1433 video_ids.append(video_id)
1434 if len(video_ids) == n:
1435 # Specified n videos reached
1436 for id in video_ids:
58ca755f 1437 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1438 return
1439
1440 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1441 for id in video_ids:
58ca755f 1442 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1443 return
1444
1445 pagenum = pagenum + 1
1446
1447
1448class YahooSearchIE(InfoExtractor):
1449 """Information Extractor for Yahoo! Video search queries."""
1450 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1451 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1452 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1453 _MORE_PAGES_INDICATOR = r'\s*Next'
d77c3dfd
FV
1454 _max_yahoo_results = 1000
1455 IE_NAME = u'video.yahoo:search'
1456
58ca755f 1457 def __init__(self, downloader=None):
d77c3dfd 1458 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1459
1460 def report_download_page(self, query, pagenum):
1461 """Report attempt to download playlist page with given number."""
1462 query = query.decode(preferredencoding())
1463 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1464
d77c3dfd
FV
1465 def _real_extract(self, query):
1466 mobj = re.match(self._VALID_URL, query)
1467 if mobj is None:
1468 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1469 return
1470
1471 prefix, query = query.split(':')
1472 prefix = prefix[8:]
1473 query = query.encode('utf-8')
1474 if prefix == '':
1475 self._download_n_results(query, 1)
1476 return
1477 elif prefix == 'all':
1478 self._download_n_results(query, self._max_yahoo_results)
1479 return
1480 else:
1481 try:
1482 n = long(prefix)
1483 if n <= 0:
1484 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1485 return
1486 elif n > self._max_yahoo_results:
1487 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1488 n = self._max_yahoo_results
1489 self._download_n_results(query, n)
1490 return
1491 except ValueError: # parsing prefix as integer fails
1492 self._download_n_results(query, 1)
1493 return
1494
1495 def _download_n_results(self, query, n):
1496 """Downloads a specified number of results for a query"""
1497
1498 video_ids = []
1499 already_seen = set()
1500 pagenum = 1
1501
1502 while True:
1503 self.report_download_page(query, pagenum)
1504 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1505 request = urllib2.Request(result_url)
1506 try:
1507 page = urllib2.urlopen(request).read()
1508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1509 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
d77c3dfd
FV
1510 return
1511
1512 # Extract video identifiers
1513 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1514 video_id = mobj.group(1)
1515 if video_id not in already_seen:
1516 video_ids.append(video_id)
1517 already_seen.add(video_id)
1518 if len(video_ids) == n:
1519 # Specified n videos reached
1520 for id in video_ids:
58ca755f 1521 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1522 return
1523
1524 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1525 for id in video_ids:
58ca755f 1526 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
d77c3dfd
FV
1527 return
1528
1529 pagenum = pagenum + 1
1530
1531
1532class YoutubePlaylistIE(InfoExtractor):
1533 """Information Extractor for YouTube playlists."""
1534
c6c0e23a 1535 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
d77c3dfd 1536 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
10daa766 1537 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
d4e16d3e 1538 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
d77c3dfd
FV
1539 IE_NAME = u'youtube:playlist'
1540
58ca755f 1541 def __init__(self, downloader=None):
d77c3dfd 1542 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1543
1544 def report_download_page(self, playlist_id, pagenum):
1545 """Report attempt to download playlist page with given number."""
1546 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1547
d77c3dfd
FV
1548 def _real_extract(self, url):
1549 # Extract playlist id
1550 mobj = re.match(self._VALID_URL, url)
1551 if mobj is None:
1552 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1553 return
1554
1555 # Single video case
1556 if mobj.group(3) is not None:
58ca755f 1557 self._downloader.download([mobj.group(3)])
d77c3dfd
FV
1558 return
1559
1560 # Download playlist pages
1561 # the prefix is 'p' by default for playlists, but other types need extra care
1562 playlist_prefix = mobj.group(1)
1563 if playlist_prefix == 'a':
1564 playlist_access = 'artist'
1565 else:
1566 playlist_prefix = 'p'
1567 playlist_access = 'view_play_list'
1568 playlist_id = mobj.group(2)
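# Illustration (hypothetical ids): an 'a' prefix maps a link such as
# .../artist?a=ABC123 to the 'artist' page type, while everything else is
# fetched as .../view_play_list?p=<playlist_id>&page=<n>&gl=US&hl=en.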
1569 video_ids = []
1570 pagenum = 1
1571
1572 while True:
1573 self.report_download_page(playlist_id, pagenum)
1574 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1575 request = urllib2.Request(url)
1576 try:
1577 page = urllib2.urlopen(request).read()
1578 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1579 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
d77c3dfd
FV
1580 return
1581
1582 # Extract video identifiers
1583 ids_in_page = []
1584 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1585 if mobj.group(1) not in ids_in_page:
1586 ids_in_page.append(mobj.group(1))
1587 video_ids.extend(ids_in_page)
1588
1589 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1590 break
1591 pagenum = pagenum + 1
1592
1593 playliststart = self._downloader.params.get('playliststart', 1) - 1
1594 playlistend = self._downloader.params.get('playlistend', -1)
1595 if playlistend == -1:
1596 video_ids = video_ids[playliststart:]
1597 else:
1598 video_ids = video_ids[playliststart:playlistend]
1599
1600 for id in video_ids:
58ca755f 1601 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
d77c3dfd
FV
1602 return
1603
1604
902b2a0a
FV
1605class YoutubeChannelIE(InfoExtractor):
1606 """Information Extractor for YouTube channels."""
1607
1608 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1609 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1610 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1611 IE_NAME = u'youtube:channel'
1612
1613 def report_download_page(self, channel_id, pagenum):
1614 """Report attempt to download channel page with given number."""
1615 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1616
1617 def _real_extract(self, url):
1618 # Extract channel id
1619 mobj = re.match(self._VALID_URL, url)
1620 if mobj is None:
1621 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1622 return
1623
1624 # Download channel pages
1625 channel_id = mobj.group(1)
1626 video_ids = []
1627 pagenum = 1
1628
1629 while True:
1630 self.report_download_page(channel_id, pagenum)
1631 url = self._TEMPLATE_URL % (channel_id, pagenum)
1632 request = urllib2.Request(url)
1633 try:
1634 page = urllib2.urlopen(request).read()
1635 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1636 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
902b2a0a
FV
1637 return
1638
1639 # Extract video identifiers
1640 ids_in_page = []
1641 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1642 if mobj.group(1) not in ids_in_page:
1643 ids_in_page.append(mobj.group(1))
1644 video_ids.extend(ids_in_page)
1645
1646 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1647 break
1648 pagenum = pagenum + 1
1649
1650 for id in video_ids:
1651 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1652 return
1653
1654
d77c3dfd
FV
1655class YoutubeUserIE(InfoExtractor):
1656 """Information Extractor for YouTube users."""
1657
1658 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1659 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1660 _GDATA_PAGE_SIZE = 50
1661 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1662 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
d77c3dfd
FV
1663 IE_NAME = u'youtube:user'
1664
58ca755f 1665 def __init__(self, downloader=None):
d77c3dfd 1666 InfoExtractor.__init__(self, downloader)
d77c3dfd
FV
1667
1668 def report_download_page(self, username, start_index):
1669 """Report attempt to download user page."""
1670 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1671 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1672
d77c3dfd
FV
1673 def _real_extract(self, url):
1674 # Extract username
1675 mobj = re.match(self._VALID_URL, url)
1676 if mobj is None:
1677 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1678 return
1679
1680 username = mobj.group(1)
1681
1682 # Download video ids using YouTube Data API. Result size per
1683 # query is limited (currently to 50 videos) so we need to query
1684 # page by page until there are no video ids - it means we got
1685 # all of them.
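# Illustration (given _GDATA_PAGE_SIZE == 50): pagenum 0 requests
# start-index 1, pagenum 1 requests start-index 51, and so on, e.g.
# http://gdata.youtube.com/feeds/api/users/<name>/uploads?max-results=50&start-index=51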
1686
1687 video_ids = []
1688 pagenum = 0
1689
1690 while True:
1691 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1692 self.report_download_page(username, start_index)
1693
1694 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1695
1696 try:
1697 page = urllib2.urlopen(request).read()
1698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1699 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
d77c3dfd
FV
1700 return
1701
1702 # Extract video identifiers
1703 ids_in_page = []
1704
1705 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1706 if mobj.group(1) not in ids_in_page:
1707 ids_in_page.append(mobj.group(1))
1708
1709 video_ids.extend(ids_in_page)
1710
1711 # A little optimization - if current page is not
1712 # "full", ie. does not contain PAGE_SIZE video ids then
1713 # we can assume that this page is the last one - there
1714 # are no more ids on further pages - no need to query
1715 # again.
1716
1717 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1718 break
1719
1720 pagenum += 1
1721
1722 all_ids_count = len(video_ids)
1723 playliststart = self._downloader.params.get('playliststart', 1) - 1
1724 playlistend = self._downloader.params.get('playlistend', -1)
1725
1726 if playlistend == -1:
1727 video_ids = video_ids[playliststart:]
1728 else:
1729 video_ids = video_ids[playliststart:playlistend]
1730
1731 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1732 (username, all_ids_count, len(video_ids)))
1733
1734 for video_id in video_ids:
58ca755f 1735 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1736
1737
eeeb4daa
JCGS
1738class BlipTVUserIE(InfoExtractor):
1739 """Information Extractor for blip.tv users."""
1740
1741 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
11a141de 1742 _PAGE_SIZE = 12
eeeb4daa
JCGS
1743 IE_NAME = u'blip.tv:user'
1744
1745 def __init__(self, downloader=None):
1746 InfoExtractor.__init__(self, downloader)
1747
1748 def report_download_page(self, username, pagenum):
1749 """Report attempt to download user page."""
1750 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1751 (self.IE_NAME, username, pagenum))
1752
1753 def _real_extract(self, url):
1754 # Extract username
1755 mobj = re.match(self._VALID_URL, url)
1756 if mobj is None:
1757 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1758 return
1759
1760 username = mobj.group(1)
1761
11a141de 1762 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
eeeb4daa
JCGS
1763
1764 request = urllib2.Request(url)
1765
1766 try:
1767 page = urllib2.urlopen(request).read().decode('utf-8')
11a141de
FV
1768 mobj = re.search(r'data-users-id="([^"]+)"', page)
1769 page_base = page_base % mobj.group(1)
eeeb4daa 1770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1771 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
eeeb4daa
JCGS
1772 return
1773
1774
11a141de
FV
1775 # Download video ids using BlipTV Ajax calls. Result size per
1776 # query is limited (currently to 12 videos) so we need to query
eeeb4daa
JCGS
1777 # page by page until there are no video ids - it means we got
1778 # all of them.
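# Illustration: with _PAGE_SIZE == 12, pages are fetched as
# <page_base>&page=1, <page_base>&page=2, ... until a page returns
# fewer than 12 entries.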
1779
1780 video_ids = []
11a141de 1781 pagenum = 1
eeeb4daa
JCGS
1782
1783 while True:
1784 self.report_download_page(username, pagenum)
1785
11a141de 1786 request = urllib2.Request(page_base + "&page=" + str(pagenum))
eeeb4daa
JCGS
1787
1788 try:
1789 page = urllib2.urlopen(request).read().decode('utf-8')
1790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1791 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1792 return
1793
1794 # Extract video identifiers
1795 ids_in_page = []
1796
1797 for mobj in re.finditer(r'href="/([^"]+)"', page):
1798 if mobj.group(1) not in ids_in_page:
1799 ids_in_page.append(unescapeHTML(mobj.group(1)))
1800
1801 video_ids.extend(ids_in_page)
1802
1803 # A little optimization - if current page is not
1804 # "full", ie. does not contain PAGE_SIZE video ids then
1805 # we can assume that this page is the last one - there
1806 # are no more ids on further pages - no need to query
1807 # again.
1808
1809 if len(ids_in_page) < self._PAGE_SIZE:
1810 break
1811
1812 pagenum += 1
1813
1814 all_ids_count = len(video_ids)
1815 playliststart = self._downloader.params.get('playliststart', 1) - 1
1816 playlistend = self._downloader.params.get('playlistend', -1)
1817
1818 if playlistend == -1:
1819 video_ids = video_ids[playliststart:]
1820 else:
1821 video_ids = video_ids[playliststart:playlistend]
1822
1823 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1824 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1825
1826 for video_id in video_ids:
1827 self._downloader.download([u'http://blip.tv/'+video_id])
1828
1829
d77c3dfd
FV
1830class DepositFilesIE(InfoExtractor):
1831 """Information extractor for depositfiles.com"""
1832
1833 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1834 IE_NAME = u'DepositFiles'
1835
1836 def __init__(self, downloader=None):
1837 InfoExtractor.__init__(self, downloader)
1838
1839 def report_download_webpage(self, file_id):
1840 """Report webpage download."""
1841 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1842
1843 def report_extraction(self, file_id):
1844 """Report information extraction."""
1845 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1846
1847 def _real_extract(self, url):
d77c3dfd
FV
1848 file_id = url.split('/')[-1]
1849 # Rebuild url in english locale
1850 url = 'http://depositfiles.com/en/files/' + file_id
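# e.g. http://depositfiles.com/de/files/abc123xyz (hypothetical id) becomes
# http://depositfiles.com/en/files/abc123xyz before the page is requested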
1851
1852 # Retrieve file webpage with 'Free download' button pressed
1853 free_download_indication = { 'gateway_result' : '1' }
1854 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1855 try:
1856 self.report_download_webpage(file_id)
1857 webpage = urllib2.urlopen(request).read()
1858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1859 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
d77c3dfd
FV
1860 return
1861
1862 # Search for the real file URL
1863 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1864 if (mobj is None) or (mobj.group(1) is None):
1865 # Try to figure out reason of the error.
1866 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1867 if (mobj is not None) and (mobj.group(1) is not None):
1868 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1869 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1870 else:
1871 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1872 return
1873
1874 file_url = mobj.group(1)
1875 file_extension = os.path.splitext(file_url)[1][1:]
1876
1877 # Search for file title
1878 mobj = re.search(r'<b title="(.*?)">', webpage)
1879 if mobj is None:
1880 self._downloader.trouble(u'ERROR: unable to extract title')
1881 return
1882 file_title = mobj.group(1).decode('utf-8')
1883
58ca755f
FV
1884 return [{
1885 'id': file_id.decode('utf-8'),
1886 'url': file_url.decode('utf-8'),
1887 'uploader': u'NA',
1888 'upload_date': u'NA',
1889 'title': file_title,
58ca755f
FV
1890 'ext': file_extension.decode('utf-8'),
1891 'format': u'NA',
1892 'player_url': None,
1893 }]
d77c3dfd
FV
1894
1895
1896class FacebookIE(InfoExtractor):
1897 """Information Extractor for Facebook"""
1898
1899 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1900 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1901 _NETRC_MACHINE = 'facebook'
1902 _available_formats = ['video', 'highqual', 'lowqual']
1903 _video_extensions = {
1904 'video': 'mp4',
1905 'highqual': 'mp4',
1906 'lowqual': 'mp4',
1907 }
1908 IE_NAME = u'facebook'
1909
1910 def __init__(self, downloader=None):
1911 InfoExtractor.__init__(self, downloader)
1912
1913 def _reporter(self, message):
1914 """Add header and report message."""
1915 self._downloader.to_screen(u'[facebook] %s' % message)
1916
1917 def report_login(self):
1918 """Report attempt to log in."""
1919 self._reporter(u'Logging in')
1920
1921 def report_video_webpage_download(self, video_id):
1922 """Report attempt to download video webpage."""
1923 self._reporter(u'%s: Downloading video webpage' % video_id)
1924
1925 def report_information_extraction(self, video_id):
1926 """Report attempt to extract video information."""
1927 self._reporter(u'%s: Extracting video information' % video_id)
1928
1929 def _parse_page(self, video_webpage):
1930 """Extract video information from page"""
1931 # General data
1932 data = {'title': r'\("video_title", "(.*?)"\)',
1933 'description': r'<div class="datawrap">(.*?)</div>',
1934 'owner': r'\("video_owner_name", "(.*?)"\)',
1935 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1936 }
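# The watch page embeds these values in JavaScript calls, roughly of the form
# ("video_title", "..."), ("video_owner_name", "..."), ("thumb_url", "...")
# and ("<format>_src", "...") -- shapes inferred from the regexes above.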
1937 video_info = {}
1938 for piece in data.keys():
1939 mobj = re.search(data[piece], video_webpage)
1940 if mobj is not None:
1941 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1942
1943 # Video urls
1944 video_urls = {}
1945 for fmt in self._available_formats:
1946 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1947 if mobj is not None:
1948 # The URL is in a JavaScript segment, stored as an escaped Unicode string
1949 # within the (generally UTF-8) page
1950 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1951 video_info['video_urls'] = video_urls
1952
1953 return video_info
1954
1955 def _real_initialize(self):
1956 if self._downloader is None:
1957 return
1958
1959 useremail = None
1960 password = None
1961 downloader_params = self._downloader.params
1962
1963 # Attempt to use provided username and password or .netrc data
1964 if downloader_params.get('username', None) is not None:
1965 useremail = downloader_params['username']
1966 password = downloader_params['password']
1967 elif downloader_params.get('usenetrc', False):
1968 try:
1969 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1970 if info is not None:
1971 useremail = info[0]
1972 password = info[2]
1973 else:
1974 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1975 except (IOError, netrc.NetrcParseError), err:
92b91c18 1976 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
d77c3dfd
FV
1977 return
1978
1979 if useremail is None:
1980 return
1981
1982 # Log in
1983 login_form = {
1984 'email': useremail,
1985 'pass': password,
1986 'login': 'Log+In'
1987 }
1988 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1989 try:
1990 self.report_login()
1991 login_results = urllib2.urlopen(request).read()
1992 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1993 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
1994 return
1995 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 1996 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
d77c3dfd
FV
1997 return
1998
1999 def _real_extract(self, url):
2000 mobj = re.match(self._VALID_URL, url)
2001 if mobj is None:
2002 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2003 return
2004 video_id = mobj.group('ID')
2005
2006 # Get video webpage
2007 self.report_video_webpage_download(video_id)
2008 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2009 try:
2010 page = urllib2.urlopen(request)
2011 video_webpage = page.read()
2012 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2013 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
d77c3dfd
FV
2014 return
2015
2016 # Start extracting information
2017 self.report_information_extraction(video_id)
2018
2019 # Extract information
2020 video_info = self._parse_page(video_webpage)
2021
2022 # uploader
2023 if 'owner' not in video_info:
2024 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2025 return
2026 video_uploader = video_info['owner']
2027
2028 # title
2029 if 'title' not in video_info:
2030 self._downloader.trouble(u'ERROR: unable to extract video title')
2031 return
2032 video_title = video_info['title']
2033 video_title = video_title.decode('utf-8')
d77c3dfd
FV
2034
2035 # thumbnail image
2036 if 'thumbnail' not in video_info:
2037 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2038 video_thumbnail = ''
2039 else:
2040 video_thumbnail = video_info['thumbnail']
2041
2042 # upload date
2043 upload_date = u'NA'
2044 if 'upload_date' in video_info:
2045 upload_time = video_info['upload_date']
2046 timetuple = email.utils.parsedate_tz(upload_time)
2047 if timetuple is not None:
2048 try:
2049 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2050 except:
2051 pass
2052
2053 # description
2054 video_description = video_info.get('description', 'No description available.')
2055
2056 url_map = video_info['video_urls']
2057 if len(url_map.keys()) > 0:
2058 # Decide which formats to download
2059 req_format = self._downloader.params.get('format', None)
2060 format_limit = self._downloader.params.get('format_limit', None)
2061
2062 if format_limit is not None and format_limit in self._available_formats:
2063 format_list = self._available_formats[self._available_formats.index(format_limit):]
2064 else:
2065 format_list = self._available_formats
2066 existing_formats = [x for x in format_list if x in url_map]
2067 if len(existing_formats) == 0:
2068 self._downloader.trouble(u'ERROR: no known formats available for video')
2069 return
2070 if req_format is None:
2071 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2072 elif req_format == 'worst':
2073 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
2074 elif req_format == '-1':
2075 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2076 else:
2077 # Specific format
2078 if req_format not in url_map:
2079 self._downloader.trouble(u'ERROR: requested format not available')
2080 return
2081 video_url_list = [(req_format, url_map[req_format])] # Specific format
2082
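# Selection summary (mirrors the branches above): no requested format picks the
# best existing one, 'worst' picks the last existing one, '-1' keeps every
# available format, and any other value must match one of the extracted keys.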
58ca755f 2083 results = []
d77c3dfd 2084 for format_param, video_real_url in video_url_list:
d77c3dfd
FV
2085 # Extension
2086 video_extension = self._video_extensions.get(format_param, 'mp4')
2087
58ca755f
FV
2088 results.append({
2089 'id': video_id.decode('utf-8'),
2090 'url': video_real_url.decode('utf-8'),
2091 'uploader': video_uploader.decode('utf-8'),
2092 'upload_date': upload_date,
2093 'title': video_title,
58ca755f
FV
2094 'ext': video_extension.decode('utf-8'),
2095 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2096 'thumbnail': video_thumbnail.decode('utf-8'),
2097 'description': video_description.decode('utf-8'),
2098 'player_url': None,
2099 })
2100 return results
d77c3dfd
FV
2101
2102class BlipTVIE(InfoExtractor):
2103 """Information extractor for blip.tv"""
2104
2105 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2106 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2107 IE_NAME = u'blip.tv'
2108
2109 def report_extraction(self, file_id):
2110 """Report information extraction."""
2111 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2112
2113 def report_direct_download(self, title):
2114 """Report information extraction."""
2115 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2116
2117 def _real_extract(self, url):
2118 mobj = re.match(self._VALID_URL, url)
2119 if mobj is None:
2120 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2121 return
2122
2123 if '?' in url:
2124 cchar = '&'
2125 else:
2126 cchar = '?'
2127 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
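# e.g. http://blip.tv/some-show/some-episode-12345?skin=json&version=2&no_wrap=1
# (hypothetical URL) returns JSON whose 'Post' object is parsed further below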
f1927d71 2128 request = urllib2.Request(json_url.encode('utf-8'))
d77c3dfd
FV
2129 self.report_extraction(mobj.group(1))
2130 info = None
2131 try:
2132 urlh = urllib2.urlopen(request)
2133 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2134 basename = url.split('/')[-1]
2135 title,ext = os.path.splitext(basename)
2136 title = title.decode('UTF-8')
2137 ext = ext.replace('.', '')
2138 self.report_direct_download(title)
2139 info = {
2140 'id': title,
2141 'url': url,
2142 'title': title,
d77c3dfd
FV
2143 'ext': ext,
2144 'urlhandle': urlh
2145 }
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2147 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
d77c3dfd
FV
2148 return
2149 if info is None: # Regular URL
2150 try:
2151 json_code = urlh.read()
2152 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2153 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
d77c3dfd
FV
2154 return
2155
2156 try:
2157 json_data = json.loads(json_code)
2158 if 'Post' in json_data:
2159 data = json_data['Post']
2160 else:
2161 data = json_data
3fe294e4 2162
d77c3dfd
FV
2163 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2164 video_url = data['media']['url']
2165 umobj = re.match(self._URL_EXT, video_url)
2166 if umobj is None:
2167 raise ValueError('Can not determine filename extension')
2168 ext = umobj.group(1)
3fe294e4 2169
d77c3dfd
FV
2170 info = {
2171 'id': data['item_id'],
2172 'url': video_url,
2173 'uploader': data['display_name'],
2174 'upload_date': upload_date,
2175 'title': data['title'],
d77c3dfd
FV
2176 'ext': ext,
2177 'format': data['media']['mimeType'],
2178 'thumbnail': data['thumbnailUrl'],
2179 'description': data['description'],
2180 'player_url': data['embedUrl']
2181 }
2182 except (ValueError,KeyError), err:
2183 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2184 return
2185
81828271 2186 std_headers['User-Agent'] = 'iTunes/10.6.1'
58ca755f 2187 return [info]
d77c3dfd
FV
2188
2189
2190class MyVideoIE(InfoExtractor):
2191 """Information Extractor for myvideo.de."""
2192
2193 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2194 IE_NAME = u'myvideo'
2195
2196 def __init__(self, downloader=None):
2197 InfoExtractor.__init__(self, downloader)
2198
2199 def report_download_webpage(self, video_id):
2200 """Report webpage download."""
2201 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2202
2203 def report_extraction(self, video_id):
2204 """Report information extraction."""
2205 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2206
2207 def _real_extract(self,url):
2208 mobj = re.match(self._VALID_URL, url)
2209 if mobj is None:
2210 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2211 return
2212
2213 video_id = mobj.group(1)
2214
2215 # Get video webpage
2216 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2217 try:
2218 self.report_download_webpage(video_id)
2219 webpage = urllib2.urlopen(request).read()
2220 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2221 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
d77c3dfd
FV
2222 return
2223
2224 self.report_extraction(video_id)
2225 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2226 webpage)
2227 if mobj is None:
2228 self._downloader.trouble(u'ERROR: unable to extract media URL')
2229 return
2230 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2231
2232 mobj = re.search('<title>([^<]+)</title>', webpage)
2233 if mobj is None:
2234 self._downloader.trouble(u'ERROR: unable to extract title')
2235 return
2236
2237 video_title = mobj.group(1)
d77c3dfd 2238
58ca755f
FV
2239 return [{
2240 'id': video_id,
2241 'url': video_url,
2242 'uploader': u'NA',
2243 'upload_date': u'NA',
2244 'title': video_title,
58ca755f
FV
2245 'ext': u'flv',
2246 'format': u'NA',
2247 'player_url': None,
2248 }]
d77c3dfd
FV
2249
2250class ComedyCentralIE(InfoExtractor):
2251 """Information extractor for The Daily Show and Colbert Report """
2252
2253 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2254 IE_NAME = u'comedycentral'
2255
4408d996
CN
2256 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2257
2258 _video_extensions = {
2259 '3500': 'mp4',
2260 '2200': 'mp4',
2261 '1700': 'mp4',
2262 '1200': 'mp4',
2263 '750': 'mp4',
2264 '400': 'mp4',
2265 }
2266 _video_dimensions = {
2267 '3500': '1280x720',
2268 '2200': '960x540',
2269 '1700': '768x432',
2270 '1200': '640x360',
2271 '750': '512x288',
2272 '400': '384x216',
2273 }
2274
d77c3dfd
FV
2275 def report_extraction(self, episode_id):
2276 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3fe294e4 2277
d77c3dfd
FV
2278 def report_config_download(self, episode_id):
2279 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2280
2281 def report_index_download(self, episode_id):
2282 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2283
2284 def report_player_url(self, episode_id):
2285 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2286
4408d996
CN
2287
2288 def _print_formats(self, formats):
2289 print('Available formats:')
2290 for x in formats:
2291 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2292
2293
d77c3dfd
FV
2294 def _real_extract(self, url):
2295 mobj = re.match(self._VALID_URL, url)
2296 if mobj is None:
2297 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2298 return
2299
2300 if mobj.group('shortname'):
2301 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2302 url = u'http://www.thedailyshow.com/full-episodes/'
2303 else:
2304 url = u'http://www.colbertnation.com/full-episodes/'
2305 mobj = re.match(self._VALID_URL, url)
2306 assert mobj is not None
2307
2308 dlNewest = not mobj.group('episode')
2309 if dlNewest:
2310 epTitle = mobj.group('showname')
2311 else:
2312 epTitle = mobj.group('episode')
2313
2314 req = urllib2.Request(url)
2315 self.report_extraction(epTitle)
2316 try:
2317 htmlHandle = urllib2.urlopen(req)
2318 html = htmlHandle.read()
2319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2320 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2321 return
2322 if dlNewest:
2323 url = htmlHandle.geturl()
2324 mobj = re.match(self._VALID_URL, url)
2325 if mobj is None:
2326 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2327 return
2328 if mobj.group('episode') == '':
2329 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2330 return
2331 epTitle = mobj.group('episode')
2332
2333 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
4a702f38 2334
d77c3dfd 2335 if len(mMovieParams) == 0:
4a702f38
CN
2336 # The Colbert Report embeds the information in a data-mgid attribute
2337 # without a URL prefix; so extract the alternate reference
2338 # and then add the URL prefix manually.
d77c3dfd 2339
4a702f38
CN
2340 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2341 if len(altMovieParams) == 0:
2342 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2343 return
2344 else:
2345 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2346
d77c3dfd
FV
2347 playerUrl_raw = mMovieParams[0][0]
2348 self.report_player_url(epTitle)
2349 try:
2350 urlHandle = urllib2.urlopen(playerUrl_raw)
2351 playerUrl = urlHandle.geturl()
2352 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2353 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2354 return
2355
2356 uri = mMovieParams[0][1]
2357 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2358 self.report_index_download(epTitle)
2359 try:
2360 indexXml = urllib2.urlopen(indexUrl).read()
2361 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2362 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2363 return
2364
58ca755f
FV
2365 results = []
2366
d77c3dfd
FV
2367 idoc = xml.etree.ElementTree.fromstring(indexXml)
2368 itemEls = idoc.findall('.//item')
2369 for itemEl in itemEls:
2370 mediaId = itemEl.findall('./guid')[0].text
2371 shortMediaId = mediaId.split(':')[-1]
2372 showId = mediaId.split(':')[-2].replace('.com', '')
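# Illustration (hypothetical guid): "mgid:cms:video:thedailyshow.com:123456"
# would yield shortMediaId "123456" and showId "thedailyshow"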
2373 officialTitle = itemEl.findall('./title')[0].text
2374 officialDate = itemEl.findall('./pubDate')[0].text
2375
2376 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2377 urllib.urlencode({'uri': mediaId}))
2378 configReq = urllib2.Request(configUrl)
2379 self.report_config_download(epTitle)
2380 try:
2381 configXml = urllib2.urlopen(configReq).read()
2382 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2383 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2384 return
2385
2386 cdoc = xml.etree.ElementTree.fromstring(configXml)
2387 turls = []
2388 for rendition in cdoc.findall('.//rendition'):
2389 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2390 turls.append(finfo)
2391
2392 if len(turls) == 0:
2393 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2394 continue
4408d996 2395
4408d996 2396 if self._downloader.params.get('listformats', None):
6bac102a
PH
2397 self._print_formats([i[0] for i in turls])
2398 return
4408d996 2399
d8dddb7c 2400 # For now, just pick the highest bitrate
feb22fe5 2401 format,video_url = turls[-1]
d8dddb7c
CN
2402
2403 # Get the format arg from the arg stream
4408d996
CN
2404 req_format = self._downloader.params.get('format', None)
2405
d8dddb7c 2406 # Select format if we can find one
4408d996 2407 for f,v in turls:
6bac102a
PH
2408 if f == req_format:
2409 format, video_url = f, v
2410 break
36a9c0b5 2411
6bac102a
PH
2412 # Patch to download from alternative CDN, which does not
2413 # break on current RTMPDump builds
36a9c0b5
CN
2414 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2415 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
6bac102a 2416
36a9c0b5 2417 if video_url.startswith(broken_cdn):
6bac102a 2418 video_url = video_url.replace(broken_cdn, better_cdn)
36a9c0b5 2419
d77c3dfd
FV
2420 effTitle = showId + u'-' + epTitle
2421 info = {
2422 'id': shortMediaId,
2423 'url': video_url,
2424 'uploader': showId,
2425 'upload_date': officialDate,
2426 'title': effTitle,
d77c3dfd
FV
2427 'ext': 'mp4',
2428 'format': format,
2429 'thumbnail': None,
2430 'description': officialTitle,
36a9c0b5 2431 'player_url': None #playerUrl
d77c3dfd
FV
2432 }
2433
58ca755f
FV
2434 results.append(info)
2435
2436 return results
d77c3dfd
FV
2437
2438
2439class EscapistIE(InfoExtractor):
2440 """Information extractor for The Escapist """
2441
2442 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2443 IE_NAME = u'escapist'
2444
2445 def report_extraction(self, showName):
2446 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2447
2448 def report_config_download(self, showName):
2449 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2450
2451 def _real_extract(self, url):
d77c3dfd
FV
2452 mobj = re.match(self._VALID_URL, url)
2453 if mobj is None:
2454 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2455 return
2456 showName = mobj.group('showname')
2457 videoId = mobj.group('episode')
2458
2459 self.report_extraction(showName)
2460 try:
3210735c
PH
2461 webPage = urllib2.urlopen(url)
2462 webPageBytes = webPage.read()
2463 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2464 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
d77c3dfd
FV
2465 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2466 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2467 return
2468
2469 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
9e6dd238 2470 description = unescapeHTML(descMatch.group(1))
d77c3dfd 2471 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
9e6dd238 2472 imgUrl = unescapeHTML(imgMatch.group(1))
d77c3dfd 2473 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
9e6dd238 2474 playerUrl = unescapeHTML(playerUrlMatch.group(1))
d77c3dfd
FV
2475 configUrlMatch = re.search('config=(.*)$', playerUrl)
2476 configUrl = urllib2.unquote(configUrlMatch.group(1))
2477
2478 self.report_config_download(showName)
2479 try:
2480 configJSON = urllib2.urlopen(configUrl).read()
2481 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2482 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2483 return
2484
2485 # Technically, it's JavaScript, not JSON
2486 configJSON = configJSON.replace("'", '"')
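# Illustration (assumed shape): a config such as {'playlist': [..., {'url': '...'}]}
# parses as JSON once single quotes are swapped for double quotes; entry 1 of
# the playlist supplies the video URL below.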
2487
2488 try:
2489 config = json.loads(configJSON)
2490 except (ValueError,), err:
2491 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2492 return
2493
2494 playlist = config['playlist']
2495 videoUrl = playlist[1]['url']
2496
d77c3dfd
FV
2497 info = {
2498 'id': videoId,
2499 'url': videoUrl,
2500 'uploader': showName,
2501 'upload_date': None,
2502 'title': showName,
d77c3dfd
FV
2503 'ext': 'flv',
2504 'format': 'flv',
2505 'thumbnail': imgUrl,
2506 'description': description,
2507 'player_url': playerUrl,
2508 }
2509
58ca755f 2510 return [info]
d77c3dfd
FV
2511
2512
2513class CollegeHumorIE(InfoExtractor):
2514 """Information extractor for collegehumor.com"""
2515
2516 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2517 IE_NAME = u'collegehumor'
2518
2519 def report_webpage(self, video_id):
2520 """Report information extraction."""
2521 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2522
2523 def report_extraction(self, video_id):
2524 """Report information extraction."""
2525 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2526
2527 def _real_extract(self, url):
d77c3dfd
FV
2528 mobj = re.match(self._VALID_URL, url)
2529 if mobj is None:
2530 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2531 return
2532 video_id = mobj.group('videoid')
2533
2534 self.report_webpage(video_id)
2535 request = urllib2.Request(url)
2536 try:
2537 webpage = urllib2.urlopen(request).read()
2538 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2539 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
d77c3dfd
FV
2540 return
2541
2542 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2543 if m is None:
2544 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2545 return
2546 internal_video_id = m.group('internalvideoid')
2547
2548 info = {
2549 'id': video_id,
2550 'internal_id': internal_video_id,
2551 }
2552
2553 self.report_extraction(video_id)
2554 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2555 try:
2556 metaXml = urllib2.urlopen(xmlUrl).read()
2557 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2558 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
d77c3dfd
FV
2559 return
2560
2561 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2562 try:
2563 videoNode = mdoc.findall('./video')[0]
2564 info['description'] = videoNode.findall('./description')[0].text
2565 info['title'] = videoNode.findall('./caption')[0].text
d77c3dfd
FV
2566 info['url'] = videoNode.findall('./file')[0].text
2567 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2568 info['ext'] = info['url'].rpartition('.')[2]
2569 info['format'] = info['ext']
2570 except IndexError:
2571 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2572 return
2573
58ca755f 2574 return [info]
d77c3dfd
FV
2575
2576
2577class XVideosIE(InfoExtractor):
2578 """Information extractor for xvideos.com"""
2579
2580 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2581 IE_NAME = u'xvideos'
2582
2583 def report_webpage(self, video_id):
2584 """Report information extraction."""
2585 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2586
2587 def report_extraction(self, video_id):
2588 """Report information extraction."""
2589 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2590
2591 def _real_extract(self, url):
d77c3dfd
FV
2592 mobj = re.match(self._VALID_URL, url)
2593 if mobj is None:
2594 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2595 return
2596 video_id = mobj.group(1).decode('utf-8')
2597
2598 self.report_webpage(video_id)
2599
2600 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2601 try:
2602 webpage = urllib2.urlopen(request).read()
2603 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2604 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
d77c3dfd
FV
2605 return
2606
2607 self.report_extraction(video_id)
2608
2609
2610 # Extract video URL
2611 mobj = re.search(r'flv_url=(.+?)&', webpage)
2612 if mobj is None:
2613 self._downloader.trouble(u'ERROR: unable to extract video url')
2614 return
2615 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2616
2617
2618 # Extract title
2619 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2620 if mobj is None:
2621 self._downloader.trouble(u'ERROR: unable to extract video title')
2622 return
2623 video_title = mobj.group(1).decode('utf-8')
2624
2625
2626 # Extract video thumbnail
363a4e11 2627 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
d77c3dfd
FV
2628 if mobj is None:
2629 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2630 return
363a4e11 2631 video_thumbnail = mobj.group(0).decode('utf-8')
d77c3dfd 2632
d77c3dfd
FV
2633 info = {
2634 'id': video_id,
2635 'url': video_url,
2636 'uploader': None,
2637 'upload_date': None,
2638 'title': video_title,
d77c3dfd
FV
2639 'ext': 'flv',
2640 'format': 'flv',
2641 'thumbnail': video_thumbnail,
2642 'description': None,
2643 'player_url': None,
2644 }
2645
58ca755f 2646 return [info]
d77c3dfd
FV
2647
2648
2649class SoundcloudIE(InfoExtractor):
2650 """Information extractor for soundcloud.com
2651 To access the media, the uid of the song and a stream token
2652 must be extracted from the page source and the script must make
2653 a request to media.soundcloud.com/crossdomain.xml. Then
2654 the media can be grabbed by requesting from a URL composed
2655 of the stream token and uid
2656 """
2657
2658 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2659 IE_NAME = u'soundcloud'
2660
2661 def __init__(self, downloader=None):
2662 InfoExtractor.__init__(self, downloader)
2663
2664 def report_webpage(self, video_id):
2665 """Report information extraction."""
2666 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2667
2668 def report_extraction(self, video_id):
2669 """Report information extraction."""
2670 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2671
2672 def _real_extract(self, url):
d77c3dfd
FV
2673 mobj = re.match(self._VALID_URL, url)
2674 if mobj is None:
2675 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2676 return
2677
2678 # extract uploader (which is in the url)
2679 uploader = mobj.group(1).decode('utf-8')
2680 # extract simple title (uploader + slug of song title)
2681 slug_title = mobj.group(2).decode('utf-8')
2c288bda 2682 simple_title = uploader + u'-' + slug_title
d77c3dfd
FV
2683
2684 self.report_webpage('%s/%s' % (uploader, slug_title))
2685
2686 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2687 try:
2688 webpage = urllib2.urlopen(request).read()
2689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2690 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
d77c3dfd
FV
2691 return
2692
2693 self.report_extraction('%s/%s' % (uploader, slug_title))
2694
2695 # extract uid and stream token that soundcloud hands out for access
2696 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2697 if mobj:
2698 video_id = mobj.group(1)
2699 stream_token = mobj.group(2)
2700
2701 # extract unsimplified title
2702 mobj = re.search('"title":"(.*?)",', webpage)
2703 if mobj:
2c288bda
FV
2704 title = mobj.group(1).decode('utf-8')
2705 else:
2706 title = simple_title
d77c3dfd
FV
2707
2708 # construct media url (with uid/token)
2709 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2710 mediaURL = mediaURL % (video_id, stream_token)
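# e.g. http://media.soundcloud.com/stream/<uid>?stream_token=<token>
# (placeholders; the real values come from the page-source regex above)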
2711
2712 # description
2713 description = u'No description available'
2714 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2715 if mobj:
2716 description = mobj.group(1)
92b91c18 2717
d77c3dfd
FV
2718 # upload date
2719 upload_date = None
2720 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2721 if mobj:
2722 try:
2723 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2724 except Exception, e:
92b91c18 2725 self._downloader.to_stderr(compat_str(e))
d77c3dfd
FV
2726
2727 # for soundcloud, a request to a cross domain is required for cookies
2728 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', None, std_headers)
2729
58ca755f
FV
2730 return [{
2731 'id': video_id.decode('utf-8'),
2732 'url': mediaURL,
2733 'uploader': uploader.decode('utf-8'),
2734 'upload_date': upload_date,
2c288bda 2735 'title': title,
58ca755f
FV
2736 'ext': u'mp3',
2737 'format': u'NA',
2738 'player_url': None,
2739 'description': description.decode('utf-8')
2740 }]
d77c3dfd
FV
2741
2742
2743class InfoQIE(InfoExtractor):
2744 """Information extractor for infoq.com"""
2745
2746 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2747 IE_NAME = u'infoq'
2748
2749 def report_webpage(self, video_id):
2750 """Report information extraction."""
2751 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2752
2753 def report_extraction(self, video_id):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2756
2757 def _real_extract(self, url):
d77c3dfd
FV
2758 mobj = re.match(self._VALID_URL, url)
2759 if mobj is None:
2760 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2761 return
2762
2763 self.report_webpage(url)
2764
2765 request = urllib2.Request(url)
2766 try:
2767 webpage = urllib2.urlopen(request).read()
2768 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2769 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
d77c3dfd
FV
2770 return
2771
2772 self.report_extraction(url)
2773
2774
2775 # Extract video URL
2776 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2777 if mobj is None:
2778 self._downloader.trouble(u'ERROR: unable to extract video url')
2779 return
2780 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
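# The jsclassref value decodes (base64, then URL-unquote) to a relative path;
# a decoded value such as "presentations/12-example.flv" (hypothetical) yields
# rtmpe://video.infoq.com/cfx/st/presentations/12-example.flv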
2781
2782
2783 # Extract title
2784 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2785 if mobj is None:
2786 self._downloader.trouble(u'ERROR: unable to extract video title')
2787 return
2788 video_title = mobj.group(1).decode('utf-8')
2789
2790 # Extract description
2791 video_description = u'No description available.'
2792 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2793 if mobj is not None:
2794 video_description = mobj.group(1).decode('utf-8')
2795
2796 video_filename = video_url.split('/')[-1]
2797 video_id, extension = video_filename.split('.')
2798
d77c3dfd
FV
2799 info = {
2800 'id': video_id,
2801 'url': video_url,
2802 'uploader': None,
2803 'upload_date': None,
2804 'title': video_title,
d77c3dfd
FV
2805 'ext': extension,
2806 'format': extension, # Extension is always(?) mp4, but seems to be flv
2807 'thumbnail': None,
2808 'description': video_description,
2809 'player_url': None,
2810 }
2811
58ca755f 2812 return [info]
d77c3dfd
FV
2813
2814class MixcloudIE(InfoExtractor):
2815 """Information extractor for www.mixcloud.com"""
2816 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2817 IE_NAME = u'mixcloud'
2818
2819 def __init__(self, downloader=None):
2820 InfoExtractor.__init__(self, downloader)
2821
2822 def report_download_json(self, file_id):
2823 """Report JSON download."""
2824 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2825
2826 def report_extraction(self, file_id):
2827 """Report information extraction."""
2828 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2829
2830 def get_urls(self, jsonData, fmt, bitrate='best'):
2831 """Get urls from 'audio_formats' section in json"""
2832 file_url = None
2833 try:
2834 bitrate_list = jsonData[fmt]
2835 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2836 bitrate = max(bitrate_list) # select highest
2837
2838 url_list = jsonData[fmt][bitrate]
2839 except TypeError: # we have no bitrate info.
2840 url_list = jsonData[fmt]
d77c3dfd
FV
2841 return url_list
2842
2843 def check_urls(self, url_list):
2844 """Returns 1st active url from list"""
2845 for url in url_list:
2846 try:
2847 urllib2.urlopen(url)
2848 return url
2849 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2850 url = None
2851
2852 return None
2853
2854 def _print_formats(self, formats):
51937c08 2855 print('Available formats:')
d77c3dfd
FV
2856 for fmt in formats.keys():
2857 for b in formats[fmt]:
2858 try:
2859 ext = formats[fmt][b][0]
51937c08 2860 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
d77c3dfd
FV
2861 except TypeError: # we have no bitrate info
2862 ext = formats[fmt][0]
51937c08 2863 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
d77c3dfd
FV
2864 break
2865
2866 def _real_extract(self, url):
2867 mobj = re.match(self._VALID_URL, url)
2868 if mobj is None:
2869 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2870 return
2871 # extract uploader & filename from url
2872 uploader = mobj.group(1).decode('utf-8')
2873 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2874
2875 # construct API request
2876 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2877 # retrieve .json file with links to files
2878 request = urllib2.Request(file_url)
2879 try:
2880 self.report_download_json(file_url)
2881 jsonData = urllib2.urlopen(request).read()
2882 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 2883 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
d77c3dfd
FV
2884 return
2885
2886 # parse JSON
2887 json_data = json.loads(jsonData)
2888 player_url = json_data['player_swf_url']
2889 formats = dict(json_data['audio_formats'])
2890
2891 req_format = self._downloader.params.get('format', None)
2892 bitrate = None
2893
2894 if self._downloader.params.get('listformats', None):
2895 self._print_formats(formats)
2896 return
2897
2898 if req_format is None or req_format == 'best':
2899 for format_param in formats.keys():
2900 url_list = self.get_urls(formats, format_param)
2901 # check urls
2902 file_url = self.check_urls(url_list)
2903 if file_url is not None:
2904 break # got it!
2905 else:
2906 if req_format not in formats.keys():
2907 self._downloader.trouble(u'ERROR: format is not available')
2908 return
2909
2910 url_list = self.get_urls(formats, req_format)
2911 file_url = self.check_urls(url_list)
2912 format_param = req_format
2913
58ca755f
FV
2914 return [{
2915 'id': file_id.decode('utf-8'),
2916 'url': file_url.decode('utf-8'),
2917 'uploader': uploader.decode('utf-8'),
2918 'upload_date': u'NA',
2919 'title': json_data['name'],
58ca755f
FV
2920 'ext': file_url.split('.')[-1].decode('utf-8'),
2921 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2922 'thumbnail': json_data['thumbnail_url'],
2923 'description': json_data['description'],
2924 'player_url': player_url.decode('utf-8'),
2925 }]
d77c3dfd
FV
2926
2927class StanfordOpenClassroomIE(InfoExtractor):
2928 """Information extractor for Stanford's Open ClassRoom"""
2929
2930 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
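# Three URL shapes are handled (hypothetical examples):
#   .../MainFolder/VideoPage.php?course=ML&video=01.1-Intro  -> a single video
#   .../MainFolder/CoursePage.php?course=ML                  -> a course playlist
#   http://openclassroom.stanford.edu/                       -> the full root playlist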
2931 IE_NAME = u'stanfordoc'
2932
2933 def report_download_webpage(self, objid):
2934 """Report information extraction."""
2935 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2936
2937 def report_extraction(self, video_id):
2938 """Report information extraction."""
2939 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2940
2941 def _real_extract(self, url):
2942 mobj = re.match(self._VALID_URL, url)
2943 if mobj is None:
2944 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2945 return
2946
2947 if mobj.group('course') and mobj.group('video'): # A specific video
2948 course = mobj.group('course')
2949 video = mobj.group('video')
2950 info = {
2c288bda 2951 'id': course + '_' + video,
d77c3dfd 2952 }
3fe294e4 2953
d77c3dfd
FV
2954 self.report_extraction(info['id'])
2955 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2956 xmlUrl = baseUrl + video + '.xml'
2957 try:
2958 metaXml = urllib2.urlopen(xmlUrl).read()
2959 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2960 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2961 return
2962 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2963 try:
2964 info['title'] = mdoc.findall('./title')[0].text
2965 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2966 except IndexError:
2967 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2968 return
d77c3dfd
FV
2969 info['ext'] = info['url'].rpartition('.')[2]
2970 info['format'] = info['ext']
58ca755f 2971 return [info]
d77c3dfd 2972 elif mobj.group('course'): # A course page
d77c3dfd
FV
2973 course = mobj.group('course')
2974 info = {
2c288bda 2975 'id': course,
d77c3dfd
FV
2976 'type': 'playlist',
2977 }
2978
2979 self.report_download_webpage(info['id'])
2980 try:
2981 coursepage = urllib2.urlopen(url).read()
2982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2983 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2984 return
2985
2986 m = re.search('<h1>([^<]+)</h1>', coursepage)
2987 if m:
2988 info['title'] = unescapeHTML(m.group(1))
2989 else:
2990 info['title'] = info['id']
d77c3dfd
FV
2991
2992 m = re.search('<description>([^<]+)</description>', coursepage)
2993 if m:
2994 info['description'] = unescapeHTML(m.group(1))
2995
2996 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2997 info['list'] = [
2998 {
2999 'type': 'reference',
3000 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3001 }
3002 for vpage in links]
58ca755f 3003 results = []
d77c3dfd
FV
3004 for entry in info['list']:
3005 assert entry['type'] == 'reference'
58ca755f
FV
3006 results += self.extract(entry['url'])
3007 return results
3008
d77c3dfd 3009 else: # Root page
d77c3dfd
FV
3010 info = {
3011 'id': 'Stanford OpenClassroom',
3012 'type': 'playlist',
3013 }
3014
3015 self.report_download_webpage(info['id'])
3016 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3017 try:
3018 rootpage = urllib2.urlopen(rootURL).read()
3019 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3020 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3021 return
3022
3023 info['title'] = info['id']
d77c3dfd
FV
3024
3025 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3026 info['list'] = [
3027 {
3028 'type': 'reference',
3029 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3030 }
3031 for cpage in links]
3032
58ca755f 3033 results = []
d77c3dfd
FV
3034 for entry in info['list']:
3035 assert entry['type'] == 'reference'
58ca755f
FV
3036 results += self.extract(entry['url'])
3037 return results
d77c3dfd
FV
3038
3039class MTVIE(InfoExtractor):
3040 """Information extractor for MTV.com"""
3041
3042 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3043 IE_NAME = u'mtv'
3044
3045 def report_webpage(self, video_id):
3046 """Report information extraction."""
3047 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3048
3049 def report_extraction(self, video_id):
3050 """Report information extraction."""
3051 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3052
3053 def _real_extract(self, url):
3054 mobj = re.match(self._VALID_URL, url)
3055 if mobj is None:
3056 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3057 return
3058 if not mobj.group('proto'):
3059 url = 'http://' + url
3060 video_id = mobj.group('videoid')
3061 self.report_webpage(video_id)
3062
3063 request = urllib2.Request(url)
3064 try:
3065 webpage = urllib2.urlopen(request).read()
3066 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 3067 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
d77c3dfd
FV
3068 return
3069
3070 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3071 if mobj is None:
3072 self._downloader.trouble(u'ERROR: unable to extract song name')
3073 return
3074 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3075 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3076 if mobj is None:
3077 self._downloader.trouble(u'ERROR: unable to extract performer')
3078 return
3079 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3080 video_title = performer + ' - ' + song_name
3081
3082 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3083 if mobj is None:
3085 self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
3085 return
3086 mtvn_uri = mobj.group(1)
3087
3088 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3089 if mobj is None:
3090 self._downloader.trouble(u'ERROR: unable to extract content id')
3091 return
3092 content_id = mobj.group(1)
3093
3094 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3095 self.report_extraction(video_id)
3096 request = urllib2.Request(videogen_url)
3097 try:
3098 metadataXml = urllib2.urlopen(request).read()
3099 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 3100 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
d77c3dfd
FV
3101 return
3102
3103 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3104 renditions = mdoc.findall('.//rendition')
3105
3106 # For now, always pick the highest quality.
3107 rendition = renditions[-1]
3108
3109 try:
3110 _,_,ext = rendition.attrib['type'].partition('/')
3111 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3112 video_url = rendition.find('./src').text
3113 except KeyError:
3114 self._downloader.trouble(u'ERROR: Invalid rendition field.')
3115 return
3116
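# Rough example of the format string built above (hypothetical values): a
# rendition with type 'video/mp4', width '640', height '360' and bitrate '450'
# yields ext 'mp4' and format 'mp4-640x360_450'.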
d77c3dfd
FV
3117 info = {
3118 'id': video_id,
3119 'url': video_url,
3120 'uploader': performer,
3121 'title': video_title,
d77c3dfd
FV
3122 'ext': ext,
3123 'format': format,
3124 }
3125
58ca755f 3126 return [info]
6de7ef9b 3127
302efc19 3128
302efc19 3129class YoukuIE(InfoExtractor):
3130
3131 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3132 IE_NAME = u'Youku'
3133
3134 def __init__(self, downloader=None):
3135 InfoExtractor.__init__(self, downloader)
3136
3137 def report_download_webpage(self, file_id):
3138 """Report webpage download."""
3139 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3140
3141 def report_extraction(self, file_id):
3142 """Report information extraction."""
3143 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3144
3145 def _gen_sid(self):
3146 nowTime = int(time.time() * 1000)
3147 random1 = random.randint(1000,1998)
3148 random2 = random.randint(1000,9999)
3149
3150 return "%d%d%d" %(nowTime,random1,random2)
3151
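# The sid is the current time in milliseconds with two random integers
# appended, e.g. (hypothetical values) nowTime=1300000000000, random1=1234,
# random2=5678 gives '130000000000012345678'.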
3152 def _get_file_ID_mix_string(self, seed):
3153 mixed = []
3154 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3155 seed = float(seed)
3156 for i in range(len(source)):
3157 seed = (seed * 211 + 30031 ) % 65536
3158 index = math.floor(seed / 65536 * len(source) )
3159 mixed.append(source[int(index)])
3160 source.remove(source[int(index)])
3161 #return ''.join(mixed)
3162 return mixed
3163
302efc19 3164 def _get_file_id(self, fileId, seed):
3165 mixed = self._get_file_ID_mix_string(seed)
3166 ids = fileId.split('*')
3167 realId = []
3168 for ch in ids:
d5c4c4c1 3169 if ch:
302efc19 3170 realId.append(mixed[int(ch)])
3171 return ''.join(realId)
3172
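# How the two helpers above fit together: the seed drives a small linear
# congruential recurrence (seed = (seed * 211 + 30031) % 65536) that
# deterministically shuffles the source alphabet, and the '*'-separated
# numbers in fileId are indices into that shuffled alphabet. Illustrative
# (hypothetical) usage:
#   mixed = self._get_file_ID_mix_string(seed)
#   self._get_file_id('3*15*0', seed) == mixed[3] + mixed[15] + mixed[0]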
302efc19 3173 def _real_extract(self, url):
3174 mobj = re.match(self._VALID_URL, url)
3175 if mobj is None:
3176 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3177 return
3178 video_id = mobj.group('ID')
3179
3180 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
302efc19 3181
3182 request = urllib2.Request(info_url, None, std_headers)
3183 try:
3184 self.report_download_webpage(video_id)
3185 jsondata = urllib2.urlopen(request).read()
3186 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
92b91c18 3187 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
302efc19 3188 return
3189
3190 self.report_extraction(video_id)
3191 try:
3192 config = json.loads(jsondata)
3193
3194 video_title = config['data'][0]['title']
3195 seed = config['data'][0]['seed']
0a98b09b 3196
3197 format = self._downloader.params.get('format', None)
3198 supported_format = config['data'][0]['streamfileids'].keys()
3199
3200 if format is None or format == 'best':
3201 if 'hd2' in supported_format:
3202 format = 'hd2'
3203 else:
3204 format = 'flv'
3205 ext = u'flv'
3206 elif format == 'worst':
3207 format = 'mp4'
3208 ext = u'mp4'
3209 else:
3210 format = 'flv'
3211 ext = u'flv'
3212
302efc19 3213
3214 fileid = config['data'][0]['streamfileids'][format]
3215 seg_number = len(config['data'][0]['segs'][format])
0a98b09b 3216
302efc19 3217 keys=[]
3218 for i in xrange(seg_number):
3219 keys.append(config['data'][0]['segs'][format][i]['k'])
3220
3221 # TODO: check for errors in the response
3222 # Youku videos can only be viewed from mainland China
3223 except:
3224 self._downloader.trouble(u'ERROR: unable to extract info section')
3225 return
3226
3227 files_info=[]
3228 sid = self._gen_sid()
3229 fileid = self._get_file_id(fileid, seed)
3230
3231 # characters at indices 8 and 9 of the fileid encode the segment number
3232 # fileid[8:10] is replaced with the hex segment index below
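# Illustrative example (hypothetical values): with fileid '0000000000ABCDEF'
# and segment index 3, temp_fileid below becomes
# '00000000' + '03' + 'ABCDEF' == '0000000003ABCDEF'.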
3233 for index, key in enumerate(keys):
3234
7733d455 3235 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3236 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
d5c4c4c1 3237
302efc19 3238 info = {
b5809a68 3239 'id': '%s_part%02d' % (video_id, index),
302efc19 3240 'url': download_url,
3241 'uploader': None,
b5809a68 3242 'title': video_title,
0a98b09b 3243 'ext': ext,
302efc19 3244 'format': u'NA'
3245 }
3246 files_info.append(info)
3247
3248 return files_info
5dc846fa
FV
3249
3250
6de7ef9b 3251class XNXXIE(InfoExtractor):
3252 """Information extractor for xnxx.com"""
3253
3254 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3255 IE_NAME = u'xnxx'
3256 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3257 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3258 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
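# These patterns pull the flv URL, the page title and the thumbnail straight
# out of the HTML source; the flv_url value appears percent-encoded in the
# page, so _real_extract passes it through urllib.unquote below.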
3259
3260 def report_webpage(self, video_id):
3261 """Report information extraction"""
3262 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3263
3264 def report_extraction(self, video_id):
3265 """Report information extraction"""
3266 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3267
6de7ef9b 3268 def _real_extract(self, url):
3269 mobj = re.match(self._VALID_URL, url)
3270 if mobj is None:
3271 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3272 return
3273 video_id = mobj.group(1).decode('utf-8')
3274
3275 self.report_webpage(video_id)
3276
3277 # Get webpage content
3278 try:
3279 webpage = urllib2.urlopen(url).read()
3280 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3281 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3282 return
3283
795cc505
FV
3284 result = re.search(self.VIDEO_URL_RE, webpage)
3285 if result is None:
3286 self._downloader.trouble(u'ERROR: unable to extract video url')
3287 return
3288 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3289
3290 result = re.search(self.VIDEO_TITLE_RE, webpage)
3291 if result is None:
3292 self._downloader.trouble(u'ERROR: unable to extract video title')
3293 return
3294 video_title = result.group(1).decode('utf-8')
3295
3296 result = re.search(self.VIDEO_THUMB_RE, webpage)
3297 if result is None:
3298 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3299 return
3300 video_thumbnail = result.group(1).decode('utf-8')
3301
6de7ef9b 3302 info = {'id': video_id,
795cc505 3303 'url': video_url,
6de7ef9b 3304 'uploader': None,
3305 'upload_date': None,
795cc505 3306 'title': video_title,
6de7ef9b 3307 'ext': 'flv',
3308 'format': 'flv',
795cc505 3309 'thumbnail': video_thumbnail,
6de7ef9b 3310 'description': None,
3311 'player_url': None}
3312
ebe3f89e 3313 return [info]
fd873c69
FV
3314
3315
d443aca8
KK
3316class GooglePlusIE(InfoExtractor):
3317 """Information extractor for plus.google.com."""
3318
fd873c69 3319 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
d443aca8
KK
3320 IE_NAME = u'plus.google'
3321
3322 def __init__(self, downloader=None):
3323 InfoExtractor.__init__(self, downloader)
3324
3325 def report_extract_entry(self, url):
3326 """Report downloading entry"""
3327 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3328
3329 def report_date(self, upload_date):
3330 """Report entry upload date"""
3331 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3332
3333 def report_uploader(self, uploader):
3334 """Report downloading extry"""
3335 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3336
3337 def report_title(self, video_title):
3338 """Report downloading extry"""
3339 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3340
3341 def report_extract_vid_page(self, video_page):
3342 """Report information extraction."""
3343 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3344
3345 def _real_extract(self, url):
3346 # Extract id from URL
3347 mobj = re.match(self._VALID_URL, url)
3348 if mobj is None:
3349 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3350 return
3351
3352 post_url = mobj.group(0)
3353 video_id = mobj.group(2)
3354
3355 video_extension = 'flv'
3356
3357 # Step 1, Retrieve post webpage to extract further information
fd873c69 3358 self.report_extract_entry(post_url)
d443aca8
KK
3359 request = urllib2.Request(post_url)
3360 try:
d443aca8
KK
3361 webpage = urllib2.urlopen(request).read()
3362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 3363 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
d443aca8
KK
3364 return
3365
3366 # Extract update date
3367 upload_date = u'NA'
3368 pattern = 'title="Timestamp">(.*?)</a>'
3369 mobj = re.search(pattern, webpage)
3370 if mobj:
3371 upload_date = mobj.group(1)
fd873c69 3372 # Convert timestring to a format suitable for filename
d443aca8
KK
3373 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3374 upload_date = upload_date.strftime('%Y%m%d')
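# e.g. a timestamp text of '2012-03-05' (hypothetical) becomes '20120305'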
3375 self.report_date(upload_date)
3376
3377 # Extract uploader
3378 uploader = u'NA'
3379 pattern = r'rel\="author".*?>(.*?)</a>'
3380 mobj = re.search(pattern, webpage)
3381 if mobj:
3382 uploader = mobj.group(1)
3383 self.report_uploader(uploader)
3384
3385 # Extract title
fd873c69 3386 # Get the first line for title
d443aca8 3387 video_title = u'NA'
fd873c69 3388 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
d443aca8
KK
3389 mobj = re.search(pattern, webpage)
3390 if mobj:
3391 video_title = mobj.group(1)
3392 self.report_title(video_title)
3393
3394 # Step 2, Simulate clicking the image box to launch the video
3395 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3396 mobj = re.search(pattern, webpage)
3397 if mobj is None:
3398 self._downloader.trouble(u'ERROR: unable to extract video page URL')
     return
3399
3400 video_page = mobj.group(1)
3401 request = urllib2.Request(video_page)
3402 try:
3403 webpage = urllib2.urlopen(request).read()
3404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
92b91c18 3405 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
d443aca8
KK
3406 return
3407 self.report_extract_vid_page(video_page)
3408
3409
3410 # Extract video links of all sizes from the video page
3412 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3413 mobj = re.findall(pattern, webpage)
fd873c69 3414 if len(mobj) == 0:
d443aca8
KK
3415 self._downloader.trouble(u'ERROR: unable to extract video links')
     return
3416
3417 # Sort by resolution
3418 links = sorted(mobj)
3419
3420 # Take the last entry of the sorted list, i.e. the highest resolution
3421 video_url = links[-1]
3422 # Keep only the URL; the resolution part of the tuple is no longer needed
3423 video_url = video_url[-1]
3424 # Decode escaped sequences such as \u0026
fd873c69 3425 video_url = unicode(video_url, "unicode_escape")
d443aca8
KK
3426
3427
3428 return [{
3429 'id': video_id.decode('utf-8'),
fd873c69 3430 'url': video_url,
d443aca8
KK
3431 'uploader': uploader.decode('utf-8'),
3432 'upload_date': upload_date.decode('utf-8'),
3433 'title': video_title.decode('utf-8'),
3434 'ext': video_extension.decode('utf-8'),
3435 'format': u'NA',
3436 'player_url': None,
3437 }]