# Scraped git-web header (preserved as comments so the module parses):
# jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
# added YouJizz extractor
# [yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs, urlparse
19
20 try:
21 import cStringIO as StringIO
22 except ImportError:
23 import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and pulls out everything the
    downloader needs to know about the video(s) it refers to: the real media
    URL, the title, the uploader, and so on.  The result is a dictionary (or
    a list of dictionaries) handed to the FileDownloader, which may then
    download the media to disk among other outcomes.  Each dictionary must
    carry the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The fields below are optional; they mainly exist so youtube-dl can act
    as the backend of a video search feature (e.g. youtube2mp3) and are only
    consulted when their respective forced-printing functions are invoked:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses should override _real_initialize() and _real_extract() and
    define a _VALID_URL regular expression; they should normally also be
    registered in the list of extractors.
    """

    # Lazily flipped to True by initialize(); class-level defaults let
    # suitable() be called before __init__ side effects matter.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(self._VALID_URL, url))

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
95
96
97
98
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matches watch/embed/short-link URLs and also a naked
    # 11-character video ID; group(1) is the optional URL prefix, group(2)
    # is the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?  # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)  # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?  # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)  # ignore playlist URLs
                         (?:  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)  # v/ or embed/ or e/
                             |(?:  # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)  # the params delimiter ? or # or #!
                                 (?:.+&)?  # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?  # optional -> youtube.com/xxxx is OK
                     )?  # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)  # here is it! the YouTube video ID
                     (?(1).+)?  # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/file extension for the downloaded media.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display string shown by --list-formats (presumably
    # "height x width" — note the unusual order; verify against upstream).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        Each <text start="..." dur="...">caption</text> element becomes one
        numbered SRT cue with HH:MM:SS,mmm timestamps.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default cue length when dur= is absent
            start = float(start)
            end = start + float(dur)
            # SRT timestamp format: HH:MM:SS,milliseconds
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per format."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age with YouTube."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (failure is non-fatal for the whole run, but aborts
        # the rest of the initialization)
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and downloadable URL(s) for one YouTube video."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL (backslash-escaped in the page JS)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values, stopping at the first
        # response that contains a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date (scraped from the watch page, normalized to YYYYMMDD)
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: all failures here are soft (Trouble carries a
        # WARNING message, reported and then ignored).
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: explicit --subtitles-lang, then
                # English, then whatever is listed first.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the comma-separated stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # One result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
495
496
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age (disables the family filter for this session)
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and the media URL for one Metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: 'yt-XXXX' IDs are re-hosted
        # YouTube videos, so delegate to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Older page layout: direct mediaURL parameter.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer page layout: media info lives inside the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
624
625
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the best-quality media URL for one video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the title slug and query string from the matched segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Pick the best quality present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        # Uploader: try the regular owner span first, then the official
        # ("rel=author") variant; missing uploader is only a warning.
        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date is shown as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
723
724
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the media URL for one Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No mp4 download link: fall back to the hex-escaped flv URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo JS hex escapes: \x3d -> '=' and \x26 -> '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (requires a second request, so only done
        # when --get-thumbnail was requested)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
818
819
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the media URL for one Photobucket video."""
        # Extract id from URL (the 'current=' query parameter, an .flv name)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   u'NA',
            'player_url':   None,
        }]
885
886
887 class YahooIE(InfoExtractor):
888 """Information extractor for video.yahoo.com."""
889
890 # _VALID_URL matches all Yahoo! Video URLs
891 # _VPAGE_URL matches only the extractable '/watch/' URLs
892 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
893 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
894 IE_NAME = u'video.yahoo'
895
896 def __init__(self, downloader=None):
897 InfoExtractor.__init__(self, downloader)
898
899 def report_download_webpage(self, video_id):
900 """Report webpage download."""
901 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
902
903 def report_extraction(self, video_id):
904 """Report information extraction."""
905 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
906
907 def _real_extract(self, url, new_video=True):
908 # Extract ID from URL
909 mobj = re.match(self._VALID_URL, url)
910 if mobj is None:
911 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
912 return
913
914 video_id = mobj.group(2)
915 video_extension = 'flv'
916
917 # Rewrite valid but non-extractable URLs as
918 # extractable English language /watch/ URLs
919 if re.match(self._VPAGE_URL, url) is None:
920 request = urllib2.Request(url)
921 try:
922 webpage = urllib2.urlopen(request).read()
923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
924 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
925 return
926
927 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
928 if mobj is None:
929 self._downloader.trouble(u'ERROR: Unable to extract id field')
930 return
931 yahoo_id = mobj.group(1)
932
933 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
934 if mobj is None:
935 self._downloader.trouble(u'ERROR: Unable to extract vid field')
936 return
937 yahoo_vid = mobj.group(1)
938
939 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
940 return self._real_extract(url, new_video=False)
941
942 # Retrieve video webpage to extract further information
943 request = urllib2.Request(url)
944 try:
945 self.report_download_webpage(video_id)
946 webpage = urllib2.urlopen(request).read()
947 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
948 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
949 return
950
951 # Extract uploader and title from webpage
952 self.report_extraction(video_id)
953 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
954 if mobj is None:
955 self._downloader.trouble(u'ERROR: unable to extract video title')
956 return
957 video_title = mobj.group(1).decode('utf-8')
958
959 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
960 if mobj is None:
961 self._downloader.trouble(u'ERROR: unable to extract video uploader')
962 return
963 video_uploader = mobj.group(1).decode('utf-8')
964
965 # Extract video thumbnail
966 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
967 if mobj is None:
968 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
969 return
970 video_thumbnail = mobj.group(1).decode('utf-8')
971
972 # Extract video description
973 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
974 if mobj is None:
975 self._downloader.trouble(u'ERROR: unable to extract video description')
976 return
977 video_description = mobj.group(1).decode('utf-8')
978 if not video_description:
979 video_description = 'No description available.'
980
981 # Extract video height and width
982 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
983 if mobj is None:
984 self._downloader.trouble(u'ERROR: unable to extract video height')
985 return
986 yv_video_height = mobj.group(1)
987
988 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
989 if mobj is None:
990 self._downloader.trouble(u'ERROR: unable to extract video width')
991 return
992 yv_video_width = mobj.group(1)
993
994 # Retrieve video playlist to extract media URL
995 # I'm not completely sure what all these options are, but we
996 # seem to need most of them, otherwise the server sends a 401.
997 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
998 yv_bitrate = '700' # according to Wikipedia this is hard-coded
999 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1000 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1001 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1002 try:
1003 self.report_download_webpage(video_id)
1004 webpage = urllib2.urlopen(request).read()
1005 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1006 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1007 return
1008
1009 # Extract media URL from playlist XML
1010 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1011 if mobj is None:
1012 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1013 return
1014 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1015 video_url = unescapeHTML(video_url)
1016
1017 return [{
1018 'id': video_id.decode('utf-8'),
1019 'url': video_url,
1020 'uploader': video_uploader,
1021 'upload_date': u'NA',
1022 'title': video_title,
1023 'ext': video_extension.decode('utf-8'),
1024 'thumbnail': video_thumbnail.decode('utf-8'),
1025 'description': video_description,
1026 'thumbnail': video_thumbnail,
1027 'player_url': None,
1028 }]
1029
1030
1031 class VimeoIE(InfoExtractor):
1032 """Information extractor for vimeo.com."""
1033
1034 # _VALID_URL matches Vimeo URLs
1035 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1036 IE_NAME = u'vimeo'
1037
1038 def __init__(self, downloader=None):
1039 InfoExtractor.__init__(self, downloader)
1040
1041 def report_download_webpage(self, video_id):
1042 """Report webpage download."""
1043 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1044
1045 def report_extraction(self, video_id):
1046 """Report information extraction."""
1047 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1048
1049 def _real_extract(self, url, new_video=True):
1050 # Extract ID from URL
1051 mobj = re.match(self._VALID_URL, url)
1052 if mobj is None:
1053 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1054 return
1055
1056 video_id = mobj.group(1)
1057
1058 # Retrieve video webpage to extract further information
1059 request = urllib2.Request(url, None, std_headers)
1060 try:
1061 self.report_download_webpage(video_id)
1062 webpage = urllib2.urlopen(request).read()
1063 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1064 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1065 return
1066
1067 # Now we begin extracting as much information as we can from what we
1068 # retrieved. First we extract the information common to all extractors,
1069 # and latter we extract those that are Vimeo specific.
1070 self.report_extraction(video_id)
1071
1072 # Extract the config JSON
1073 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1074 try:
1075 config = json.loads(config)
1076 except:
1077 self._downloader.trouble(u'ERROR: unable to extract info section')
1078 return
1079
1080 # Extract title
1081 video_title = config["video"]["title"]
1082
1083 # Extract uploader
1084 video_uploader = config["video"]["owner"]["name"]
1085
1086 # Extract video thumbnail
1087 video_thumbnail = config["video"]["thumbnail"]
1088
1089 # Extract video description
1090 video_description = get_element_by_id("description", webpage.decode('utf8'))
1091 if video_description: video_description = clean_html(video_description)
1092 else: video_description = ''
1093
1094 # Extract upload date
1095 video_upload_date = u'NA'
1096 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1097 if mobj is not None:
1098 video_upload_date = mobj.group(1)
1099
1100 # Vimeo specific: extract request signature and timestamp
1101 sig = config['request']['signature']
1102 timestamp = config['request']['timestamp']
1103
1104 # Vimeo specific: extract video codec and quality information
1105 # TODO bind to format param
1106 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1107 for codec in codecs:
1108 if codec[0] in config["video"]["files"]:
1109 video_codec = codec[0]
1110 video_extension = codec[1]
1111 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1112 else: quality = 'sd'
1113 break
1114 else:
1115 self._downloader.trouble(u'ERROR: no known codec found')
1116 return
1117
1118 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1119 %(video_id, sig, timestamp, quality, video_codec.upper())
1120
1121 return [{
1122 'id': video_id,
1123 'url': video_url,
1124 'uploader': video_uploader,
1125 'upload_date': video_upload_date,
1126 'title': video_title,
1127 'ext': video_extension,
1128 'thumbnail': video_thumbnail,
1129 'description': video_description,
1130 'player_url': None,
1131 }]
1132
1133
1134 class GenericIE(InfoExtractor):
1135 """Generic last-resort information extractor."""
1136
1137 _VALID_URL = r'.*'
1138 IE_NAME = u'generic'
1139
1140 def __init__(self, downloader=None):
1141 InfoExtractor.__init__(self, downloader)
1142
1143 def report_download_webpage(self, video_id):
1144 """Report webpage download."""
1145 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1146 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1147
1148 def report_extraction(self, video_id):
1149 """Report information extraction."""
1150 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1151
1152 def report_following_redirect(self, new_url):
1153 """Report information extraction."""
1154 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1155
1156 def _test_redirect(self, url):
1157 """Check if it is a redirect, like url shorteners, in case restart chain."""
1158 class HeadRequest(urllib2.Request):
1159 def get_method(self):
1160 return "HEAD"
1161
1162 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1163 """
1164 Subclass the HTTPRedirectHandler to make it use our
1165 HeadRequest also on the redirected URL
1166 """
1167 def redirect_request(self, req, fp, code, msg, headers, newurl):
1168 if code in (301, 302, 303, 307):
1169 newurl = newurl.replace(' ', '%20')
1170 newheaders = dict((k,v) for k,v in req.headers.items()
1171 if k.lower() not in ("content-length", "content-type"))
1172 return HeadRequest(newurl,
1173 headers=newheaders,
1174 origin_req_host=req.get_origin_req_host(),
1175 unverifiable=True)
1176 else:
1177 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1178
1179 class HTTPMethodFallback(urllib2.BaseHandler):
1180 """
1181 Fallback to GET if HEAD is not allowed (405 HTTP error)
1182 """
1183 def http_error_405(self, req, fp, code, msg, headers):
1184 fp.read()
1185 fp.close()
1186
1187 newheaders = dict((k,v) for k,v in req.headers.items()
1188 if k.lower() not in ("content-length", "content-type"))
1189 return self.parent.open(urllib2.Request(req.get_full_url(),
1190 headers=newheaders,
1191 origin_req_host=req.get_origin_req_host(),
1192 unverifiable=True))
1193
1194 # Build our opener
1195 opener = urllib2.OpenerDirector()
1196 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1197 HTTPMethodFallback, HEADRedirectHandler,
1198 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1199 opener.add_handler(handler())
1200
1201 response = opener.open(HeadRequest(url))
1202 new_url = response.geturl()
1203
1204 if url == new_url: return False
1205
1206 self.report_following_redirect(new_url)
1207 self._downloader.download([new_url])
1208 return True
1209
1210 def _real_extract(self, url):
1211 if self._test_redirect(url): return
1212
1213 video_id = url.split('/')[-1]
1214 request = urllib2.Request(url)
1215 try:
1216 self.report_download_webpage(video_id)
1217 webpage = urllib2.urlopen(request).read()
1218 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1219 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1220 return
1221 except ValueError, err:
1222 # since this is the last-resort InfoExtractor, if
1223 # this error is thrown, it'll be thrown here
1224 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1225 return
1226
1227 self.report_extraction(video_id)
1228 # Start with something easy: JW Player in SWFObject
1229 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1230 if mobj is None:
1231 # Broaden the search a little bit
1232 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1233 if mobj is None:
1234 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1235 return
1236
1237 # It's possible that one of the regexes
1238 # matched, but returned an empty group:
1239 if mobj.group(1) is None:
1240 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1241 return
1242
1243 video_url = urllib.unquote(mobj.group(1))
1244 video_id = os.path.basename(video_url)
1245
1246 # here's a fun little line of code for you:
1247 video_extension = os.path.splitext(video_id)[1][1:]
1248 video_id = os.path.splitext(video_id)[0]
1249
1250 # it's tempting to parse this further, but you would
1251 # have to take into account all the variations like
1252 # Video Title - Site Name
1253 # Site Name | Video Title
1254 # Video Title - Tagline | Site Name
1255 # and so on and so forth; it's just not practical
1256 mobj = re.search(r'<title>(.*)</title>', webpage)
1257 if mobj is None:
1258 self._downloader.trouble(u'ERROR: unable to extract title')
1259 return
1260 video_title = mobj.group(1).decode('utf-8')
1261
1262 # video uploader is domain name
1263 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1264 if mobj is None:
1265 self._downloader.trouble(u'ERROR: unable to extract title')
1266 return
1267 video_uploader = mobj.group(1).decode('utf-8')
1268
1269 return [{
1270 'id': video_id.decode('utf-8'),
1271 'url': video_url.decode('utf-8'),
1272 'uploader': video_uploader,
1273 'upload_date': u'NA',
1274 'title': video_title,
1275 'ext': video_extension.decode('utf-8'),
1276 'format': u'NA',
1277 'player_url': None,
1278 }]
1279
1280
1281 class YoutubeSearchIE(InfoExtractor):
1282 """Information Extractor for YouTube search queries."""
1283 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1284 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1285 _max_youtube_results = 1000
1286 IE_NAME = u'youtube:search'
1287
1288 def __init__(self, downloader=None):
1289 InfoExtractor.__init__(self, downloader)
1290
1291 def report_download_page(self, query, pagenum):
1292 """Report attempt to download search page with given number."""
1293 query = query.decode(preferredencoding())
1294 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1295
1296 def _real_extract(self, query):
1297 mobj = re.match(self._VALID_URL, query)
1298 if mobj is None:
1299 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1300 return
1301
1302 prefix, query = query.split(':')
1303 prefix = prefix[8:]
1304 query = query.encode('utf-8')
1305 if prefix == '':
1306 self._download_n_results(query, 1)
1307 return
1308 elif prefix == 'all':
1309 self._download_n_results(query, self._max_youtube_results)
1310 return
1311 else:
1312 try:
1313 n = long(prefix)
1314 if n <= 0:
1315 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1316 return
1317 elif n > self._max_youtube_results:
1318 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1319 n = self._max_youtube_results
1320 self._download_n_results(query, n)
1321 return
1322 except ValueError: # parsing prefix as integer fails
1323 self._download_n_results(query, 1)
1324 return
1325
1326 def _download_n_results(self, query, n):
1327 """Downloads a specified number of results for a query"""
1328
1329 video_ids = []
1330 pagenum = 0
1331 limit = n
1332
1333 while (50 * pagenum) < limit:
1334 self.report_download_page(query, pagenum+1)
1335 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1336 request = urllib2.Request(result_url)
1337 try:
1338 data = urllib2.urlopen(request).read()
1339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1340 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1341 return
1342 api_response = json.loads(data)['data']
1343
1344 new_ids = list(video['id'] for video in api_response['items'])
1345 video_ids += new_ids
1346
1347 limit = min(n, api_response['totalItems'])
1348 pagenum += 1
1349
1350 if len(video_ids) > n:
1351 video_ids = video_ids[:n]
1352 for id in video_ids:
1353 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1354 return
1355
1356
1357 class GoogleSearchIE(InfoExtractor):
1358 """Information Extractor for Google Video search queries."""
1359 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1360 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1361 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1362 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1363 _max_google_results = 1000
1364 IE_NAME = u'video.google:search'
1365
1366 def __init__(self, downloader=None):
1367 InfoExtractor.__init__(self, downloader)
1368
1369 def report_download_page(self, query, pagenum):
1370 """Report attempt to download playlist page with given number."""
1371 query = query.decode(preferredencoding())
1372 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1373
1374 def _real_extract(self, query):
1375 mobj = re.match(self._VALID_URL, query)
1376 if mobj is None:
1377 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1378 return
1379
1380 prefix, query = query.split(':')
1381 prefix = prefix[8:]
1382 query = query.encode('utf-8')
1383 if prefix == '':
1384 self._download_n_results(query, 1)
1385 return
1386 elif prefix == 'all':
1387 self._download_n_results(query, self._max_google_results)
1388 return
1389 else:
1390 try:
1391 n = long(prefix)
1392 if n <= 0:
1393 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1394 return
1395 elif n > self._max_google_results:
1396 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1397 n = self._max_google_results
1398 self._download_n_results(query, n)
1399 return
1400 except ValueError: # parsing prefix as integer fails
1401 self._download_n_results(query, 1)
1402 return
1403
1404 def _download_n_results(self, query, n):
1405 """Downloads a specified number of results for a query"""
1406
1407 video_ids = []
1408 pagenum = 0
1409
1410 while True:
1411 self.report_download_page(query, pagenum)
1412 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1413 request = urllib2.Request(result_url)
1414 try:
1415 page = urllib2.urlopen(request).read()
1416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1417 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1418 return
1419
1420 # Extract video identifiers
1421 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1422 video_id = mobj.group(1)
1423 if video_id not in video_ids:
1424 video_ids.append(video_id)
1425 if len(video_ids) == n:
1426 # Specified n videos reached
1427 for id in video_ids:
1428 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1429 return
1430
1431 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1432 for id in video_ids:
1433 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1434 return
1435
1436 pagenum = pagenum + 1
1437
1438
1439 class YahooSearchIE(InfoExtractor):
1440 """Information Extractor for Yahoo! Video search queries."""
1441 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1442 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1443 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1444 _MORE_PAGES_INDICATOR = r'\s*Next'
1445 _max_yahoo_results = 1000
1446 IE_NAME = u'video.yahoo:search'
1447
1448 def __init__(self, downloader=None):
1449 InfoExtractor.__init__(self, downloader)
1450
1451 def report_download_page(self, query, pagenum):
1452 """Report attempt to download playlist page with given number."""
1453 query = query.decode(preferredencoding())
1454 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1455
1456 def _real_extract(self, query):
1457 mobj = re.match(self._VALID_URL, query)
1458 if mobj is None:
1459 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1460 return
1461
1462 prefix, query = query.split(':')
1463 prefix = prefix[8:]
1464 query = query.encode('utf-8')
1465 if prefix == '':
1466 self._download_n_results(query, 1)
1467 return
1468 elif prefix == 'all':
1469 self._download_n_results(query, self._max_yahoo_results)
1470 return
1471 else:
1472 try:
1473 n = long(prefix)
1474 if n <= 0:
1475 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1476 return
1477 elif n > self._max_yahoo_results:
1478 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1479 n = self._max_yahoo_results
1480 self._download_n_results(query, n)
1481 return
1482 except ValueError: # parsing prefix as integer fails
1483 self._download_n_results(query, 1)
1484 return
1485
1486 def _download_n_results(self, query, n):
1487 """Downloads a specified number of results for a query"""
1488
1489 video_ids = []
1490 already_seen = set()
1491 pagenum = 1
1492
1493 while True:
1494 self.report_download_page(query, pagenum)
1495 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1496 request = urllib2.Request(result_url)
1497 try:
1498 page = urllib2.urlopen(request).read()
1499 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1500 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1501 return
1502
1503 # Extract video identifiers
1504 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1505 video_id = mobj.group(1)
1506 if video_id not in already_seen:
1507 video_ids.append(video_id)
1508 already_seen.add(video_id)
1509 if len(video_ids) == n:
1510 # Specified n videos reached
1511 for id in video_ids:
1512 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1513 return
1514
1515 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1516 for id in video_ids:
1517 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1518 return
1519
1520 pagenum = pagenum + 1
1521
1522
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Handles playlist, course, artist and user-list URLs plus bare PL/EC
    ids; each contained video is handed back to the downloader as a
    plain watch URL.
    """

    # Group 1: list-type discriminator ('p', 'a' or 'list'); group 2: the
    # playlist id; group 3 (optional): a single video id inside the list.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled with (access, prefix, playlist_id, pagenum).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # %s is the playlist id; only watch links belonging to this list count.
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Presence of the pager's "next" control means more pages follow.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group 3 captured a specific video id, so
        # delegate straight to the downloader instead of walking the list.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # 'list' (and anything else) is normalized to the classic
            # 'view_play_list?p=' form.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (de-duplicated within each page only)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the pager no longer offers a "next" page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Honor --playlist-start/--playlist-end; both are 1-based in the
        # options, hence the -1 adjustment, and -1 end means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1595
1596 class YoutubeChannelIE(InfoExtractor):
1597 """Information Extractor for YouTube channels."""
1598
1599 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1600 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1601 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1602 IE_NAME = u'youtube:channel'
1603
1604 def report_download_page(self, channel_id, pagenum):
1605 """Report attempt to download channel page with given number."""
1606 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1607
1608 def _real_extract(self, url):
1609 # Extract channel id
1610 mobj = re.match(self._VALID_URL, url)
1611 if mobj is None:
1612 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1613 return
1614
1615 # Download channel pages
1616 channel_id = mobj.group(1)
1617 video_ids = []
1618 pagenum = 1
1619
1620 while True:
1621 self.report_download_page(channel_id, pagenum)
1622 url = self._TEMPLATE_URL % (channel_id, pagenum)
1623 request = urllib2.Request(url)
1624 try:
1625 page = urllib2.urlopen(request).read()
1626 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1627 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1628 return
1629
1630 # Extract video identifiers
1631 ids_in_page = []
1632 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1633 if mobj.group(1) not in ids_in_page:
1634 ids_in_page.append(mobj.group(1))
1635 video_ids.extend(ids_in_page)
1636
1637 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1638 break
1639 pagenum = pagenum + 1
1640
1641 for id in video_ids:
1642 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1643 return
1644
1645
1646 class YoutubeUserIE(InfoExtractor):
1647 """Information Extractor for YouTube users."""
1648
1649 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1650 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1651 _GDATA_PAGE_SIZE = 50
1652 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1653 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1654 IE_NAME = u'youtube:user'
1655
1656 def __init__(self, downloader=None):
1657 InfoExtractor.__init__(self, downloader)
1658
1659 def report_download_page(self, username, start_index):
1660 """Report attempt to download user page."""
1661 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1662 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1663
1664 def _real_extract(self, url):
1665 # Extract username
1666 mobj = re.match(self._VALID_URL, url)
1667 if mobj is None:
1668 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1669 return
1670
1671 username = mobj.group(1)
1672
1673 # Download video ids using YouTube Data API. Result size per
1674 # query is limited (currently to 50 videos) so we need to query
1675 # page by page until there are no video ids - it means we got
1676 # all of them.
1677
1678 video_ids = []
1679 pagenum = 0
1680
1681 while True:
1682 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1683 self.report_download_page(username, start_index)
1684
1685 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1686
1687 try:
1688 page = urllib2.urlopen(request).read()
1689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1691 return
1692
1693 # Extract video identifiers
1694 ids_in_page = []
1695
1696 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1697 if mobj.group(1) not in ids_in_page:
1698 ids_in_page.append(mobj.group(1))
1699
1700 video_ids.extend(ids_in_page)
1701
1702 # A little optimization - if current page is not
1703 # "full", ie. does not contain PAGE_SIZE video ids then
1704 # we can assume that this page is the last one - there
1705 # are no more ids on further pages - no need to query
1706 # again.
1707
1708 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1709 break
1710
1711 pagenum += 1
1712
1713 all_ids_count = len(video_ids)
1714 playliststart = self._downloader.params.get('playliststart', 1) - 1
1715 playlistend = self._downloader.params.get('playlistend', -1)
1716
1717 if playlistend == -1:
1718 video_ids = video_ids[playliststart:]
1719 else:
1720 video_ids = video_ids[playliststart:playlistend]
1721
1722 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1723 (username, all_ids_count, len(video_ids)))
1724
1725 for video_id in video_ids:
1726 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1727
1728
1729 class BlipTVUserIE(InfoExtractor):
1730 """Information Extractor for blip.tv users."""
1731
1732 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1733 _PAGE_SIZE = 12
1734 IE_NAME = u'blip.tv:user'
1735
1736 def __init__(self, downloader=None):
1737 InfoExtractor.__init__(self, downloader)
1738
1739 def report_download_page(self, username, pagenum):
1740 """Report attempt to download user page."""
1741 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1742 (self.IE_NAME, username, pagenum))
1743
1744 def _real_extract(self, url):
1745 # Extract username
1746 mobj = re.match(self._VALID_URL, url)
1747 if mobj is None:
1748 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1749 return
1750
1751 username = mobj.group(1)
1752
1753 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1754
1755 request = urllib2.Request(url)
1756
1757 try:
1758 page = urllib2.urlopen(request).read().decode('utf-8')
1759 mobj = re.search(r'data-users-id="([^"]+)"', page)
1760 page_base = page_base % mobj.group(1)
1761 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1763 return
1764
1765
1766 # Download video ids using BlipTV Ajax calls. Result size per
1767 # query is limited (currently to 12 videos) so we need to query
1768 # page by page until there are no video ids - it means we got
1769 # all of them.
1770
1771 video_ids = []
1772 pagenum = 1
1773
1774 while True:
1775 self.report_download_page(username, pagenum)
1776
1777 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1778
1779 try:
1780 page = urllib2.urlopen(request).read().decode('utf-8')
1781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1782 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1783 return
1784
1785 # Extract video identifiers
1786 ids_in_page = []
1787
1788 for mobj in re.finditer(r'href="/([^"]+)"', page):
1789 if mobj.group(1) not in ids_in_page:
1790 ids_in_page.append(unescapeHTML(mobj.group(1)))
1791
1792 video_ids.extend(ids_in_page)
1793
1794 # A little optimization - if current page is not
1795 # "full", ie. does not contain PAGE_SIZE video ids then
1796 # we can assume that this page is the last one - there
1797 # are no more ids on further pages - no need to query
1798 # again.
1799
1800 if len(ids_in_page) < self._PAGE_SIZE:
1801 break
1802
1803 pagenum += 1
1804
1805 all_ids_count = len(video_ids)
1806 playliststart = self._downloader.params.get('playliststart', 1) - 1
1807 playlistend = self._downloader.params.get('playlistend', -1)
1808
1809 if playlistend == -1:
1810 video_ids = video_ids[playliststart:]
1811 else:
1812 video_ids = video_ids[playliststart:playlistend]
1813
1814 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1815 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1816
1817 for video_id in video_ids:
1818 self._downloader.download([u'http://blip.tv/'+video_id])
1819
1820
1821 class DepositFilesIE(InfoExtractor):
1822 """Information extractor for depositfiles.com"""
1823
1824 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1825 IE_NAME = u'DepositFiles'
1826
1827 def __init__(self, downloader=None):
1828 InfoExtractor.__init__(self, downloader)
1829
1830 def report_download_webpage(self, file_id):
1831 """Report webpage download."""
1832 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1833
1834 def report_extraction(self, file_id):
1835 """Report information extraction."""
1836 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1837
1838 def _real_extract(self, url):
1839 file_id = url.split('/')[-1]
1840 # Rebuild url in english locale
1841 url = 'http://depositfiles.com/en/files/' + file_id
1842
1843 # Retrieve file webpage with 'Free download' button pressed
1844 free_download_indication = { 'gateway_result' : '1' }
1845 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1846 try:
1847 self.report_download_webpage(file_id)
1848 webpage = urllib2.urlopen(request).read()
1849 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1850 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1851 return
1852
1853 # Search for the real file URL
1854 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1855 if (mobj is None) or (mobj.group(1) is None):
1856 # Try to figure out reason of the error.
1857 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1858 if (mobj is not None) and (mobj.group(1) is not None):
1859 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1860 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1861 else:
1862 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1863 return
1864
1865 file_url = mobj.group(1)
1866 file_extension = os.path.splitext(file_url)[1][1:]
1867
1868 # Search for file title
1869 mobj = re.search(r'<b title="(.*?)">', webpage)
1870 if mobj is None:
1871 self._downloader.trouble(u'ERROR: unable to extract title')
1872 return
1873 file_title = mobj.group(1).decode('utf-8')
1874
1875 return [{
1876 'id': file_id.decode('utf-8'),
1877 'url': file_url.decode('utf-8'),
1878 'uploader': u'NA',
1879 'upload_date': u'NA',
1880 'title': file_title,
1881 'ext': file_extension.decode('utf-8'),
1882 'format': u'NA',
1883 'player_url': None,
1884 }]
1885
1886
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Known format identifiers, best quality first; list order drives
    # the quality-selection logic in _real_extract.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Scrapes title/description/owner/thumbnail and the per-format media
        URLs out of the page's inline JavaScript. Missing fields are simply
        absent from the returned dict; 'video_urls' is always present (maybe
        empty).
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped Unicode inside the (mostly utf-8) page.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials.

        A failed or skipped login only emits a warning; extraction then
        proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse it, and return one info dict per
        selected format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz returns 10 elements; strftime wants a 9-tuple,
                    # so the trailing tz offset is sliced off.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # NOTE(review): if url_map is empty this whole branch is skipped and
        # video_url_list is never assigned, so the loop below would raise
        # NameError - confirm whether an empty map can occur in practice.
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # format_limit caps quality: keep only formats at or below it.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
2092
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Resolve a blip.tv URL.

        Requests the URL with skin=json appended to get machine-readable
        metadata. If the server instead answers with a video/* Content-Type,
        the URL is a direct media link and is passed through with the open
        handle attached as 'urlhandle'.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the right separator character.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # Derive id/title from the URL itself; downstream code can
                # reuse the already-open handle instead of re-fetching.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                # NOTE(review): 'json' is not imported at the top of this file;
                # presumably it arrives via 'from utils import *' - confirm.
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp is e.g. '04-26-12 10:03AM' -> normalized to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): globally overrides the User-Agent for the media request;
        # presumably blip.tv serves the media differently to iTunes - verify.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2179
2180
2181 class MyVideoIE(InfoExtractor):
2182 """Information Extractor for myvideo.de."""
2183
2184 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2185 IE_NAME = u'myvideo'
2186
2187 def __init__(self, downloader=None):
2188 InfoExtractor.__init__(self, downloader)
2189
2190 def report_download_webpage(self, video_id):
2191 """Report webpage download."""
2192 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2193
2194 def report_extraction(self, video_id):
2195 """Report information extraction."""
2196 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2197
2198 def _real_extract(self,url):
2199 mobj = re.match(self._VALID_URL, url)
2200 if mobj is None:
2201 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2202 return
2203
2204 video_id = mobj.group(1)
2205
2206 # Get video webpage
2207 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2208 try:
2209 self.report_download_webpage(video_id)
2210 webpage = urllib2.urlopen(request).read()
2211 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2212 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2213 return
2214
2215 self.report_extraction(video_id)
2216 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2217 webpage)
2218 if mobj is None:
2219 self._downloader.trouble(u'ERROR: unable to extract media URL')
2220 return
2221 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2222
2223 mobj = re.search('<title>([^<]+)</title>', webpage)
2224 if mobj is None:
2225 self._downloader.trouble(u'ERROR: unable to extract title')
2226 return
2227
2228 video_title = mobj.group(1)
2229
2230 return [{
2231 'id': video_id,
2232 'url': video_url,
2233 'uploader': u'NA',
2234 'upload_date': u'NA',
2235 'title': video_title,
2236 'ext': u'flv',
2237 'format': u'NA',
2238 'player_url': None,
2239 }]
2240
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ':tds'/':colbert'-style shortcuts as well as full-episode URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a per-act media configuration document."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve an episode URL (or show shortcut) into one info dict per
        act of the episode."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # A ':shortname' URL means "the newest full episode of that show".
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # The index page redirects to the newest episode; re-parse the
            # final URL to learn which episode that actually is.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The Flash player URL embeds the mgid-style URI identifying the episode.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        results = []

        # The MRSS feed lists one <item> per act of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like '...:showname.com:shortid'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                # NOTE(review): this aborts the whole episode on one act's
                # failure (return, not continue) - confirm that is intended.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Each <rendition> is a (bitrate, source URL) pair.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            results.append(info)

        return results
2372
2373
2374 class EscapistIE(InfoExtractor):
2375 """Information extractor for The Escapist """
2376
2377 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2378 IE_NAME = u'escapist'
2379
2380 def report_extraction(self, showName):
2381 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2382
2383 def report_config_download(self, showName):
2384 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2385
2386 def _real_extract(self, url):
2387 mobj = re.match(self._VALID_URL, url)
2388 if mobj is None:
2389 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2390 return
2391 showName = mobj.group('showname')
2392 videoId = mobj.group('episode')
2393
2394 self.report_extraction(showName)
2395 try:
2396 webPage = urllib2.urlopen(url)
2397 webPageBytes = webPage.read()
2398 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2399 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2400 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2402 return
2403
2404 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2405 description = unescapeHTML(descMatch.group(1))
2406 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2407 imgUrl = unescapeHTML(imgMatch.group(1))
2408 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2409 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2410 configUrlMatch = re.search('config=(.*)$', playerUrl)
2411 configUrl = urllib2.unquote(configUrlMatch.group(1))
2412
2413 self.report_config_download(showName)
2414 try:
2415 configJSON = urllib2.urlopen(configUrl).read()
2416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2417 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2418 return
2419
2420 # Technically, it's JavaScript, not JSON
2421 configJSON = configJSON.replace("'", '"')
2422
2423 try:
2424 config = json.loads(configJSON)
2425 except (ValueError,), err:
2426 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2427 return
2428
2429 playlist = config['playlist']
2430 videoUrl = playlist[1]['url']
2431
2432 info = {
2433 'id': videoId,
2434 'url': videoUrl,
2435 'uploader': showName,
2436 'upload_date': None,
2437 'title': showName,
2438 'ext': 'flv',
2439 'format': 'flv',
2440 'thumbnail': imgUrl,
2441 'description': description,
2442 'player_url': playerUrl,
2443 }
2444
2445 return [info]
2446
2447
2448 class CollegeHumorIE(InfoExtractor):
2449 """Information extractor for collegehumor.com"""
2450
2451 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2452 IE_NAME = u'collegehumor'
2453
2454 def report_webpage(self, video_id):
2455 """Report information extraction."""
2456 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2457
2458 def report_extraction(self, video_id):
2459 """Report information extraction."""
2460 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2461
2462 def _real_extract(self, url):
2463 mobj = re.match(self._VALID_URL, url)
2464 if mobj is None:
2465 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2466 return
2467 video_id = mobj.group('videoid')
2468
2469 self.report_webpage(video_id)
2470 request = urllib2.Request(url)
2471 try:
2472 webpage = urllib2.urlopen(request).read()
2473 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2475 return
2476
2477 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2478 if m is None:
2479 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2480 return
2481 internal_video_id = m.group('internalvideoid')
2482
2483 info = {
2484 'id': video_id,
2485 'internal_id': internal_video_id,
2486 }
2487
2488 self.report_extraction(video_id)
2489 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2490 try:
2491 metaXml = urllib2.urlopen(xmlUrl).read()
2492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2493 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2494 return
2495
2496 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2497 try:
2498 videoNode = mdoc.findall('./video')[0]
2499 info['description'] = videoNode.findall('./description')[0].text
2500 info['title'] = videoNode.findall('./caption')[0].text
2501 info['url'] = videoNode.findall('./file')[0].text
2502 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2503 info['ext'] = info['url'].rpartition('.')[2]
2504 info['format'] = info['ext']
2505 except IndexError:
2506 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2507 return
2508
2509 return [info]
2510
2511
2512 class XVideosIE(InfoExtractor):
2513 """Information extractor for xvideos.com"""
2514
2515 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2516 IE_NAME = u'xvideos'
2517
2518 def report_webpage(self, video_id):
2519 """Report information extraction."""
2520 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2521
2522 def report_extraction(self, video_id):
2523 """Report information extraction."""
2524 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2525
2526 def _real_extract(self, url):
2527 mobj = re.match(self._VALID_URL, url)
2528 if mobj is None:
2529 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2530 return
2531 video_id = mobj.group(1).decode('utf-8')
2532
2533 self.report_webpage(video_id)
2534
2535 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2536 try:
2537 webpage = urllib2.urlopen(request).read()
2538 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2539 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2540 return
2541
2542 self.report_extraction(video_id)
2543
2544
2545 # Extract video URL
2546 mobj = re.search(r'flv_url=(.+?)&', webpage)
2547 if mobj is None:
2548 self._downloader.trouble(u'ERROR: unable to extract video url')
2549 return
2550 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2551
2552
2553 # Extract title
2554 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2555 if mobj is None:
2556 self._downloader.trouble(u'ERROR: unable to extract video title')
2557 return
2558 video_title = mobj.group(1).decode('utf-8')
2559
2560
2561 # Extract video thumbnail
2562 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2563 if mobj is None:
2564 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2565 return
2566 video_thumbnail = mobj.group(0).decode('utf-8')
2567
2568 info = {
2569 'id': video_id,
2570 'url': video_url,
2571 'uploader': None,
2572 'upload_date': None,
2573 'title': video_title,
2574 'ext': 'flv',
2575 'format': 'flv',
2576 'thumbnail': video_thumbnail,
2577 'description': None,
2578 'player_url': None,
2579 }
2580
2581 return [info]
2582
2583
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract track info for a soundcloud.com track URL.

        Returns a single-element list with the track's stream URL
        (built from the uid and stream token scraped off the page),
        or None after reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj is None:
            # BUGFIX: previously there was no error path here, so video_id
            # and stream_token stayed unbound and a NameError was raised
            # further down instead of a proper error message.
            self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
            return
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1).decode('utf-8')
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1)

        # upload date
        upload_date = None
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            except Exception as e:
                self._downloader.to_stderr(str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        # BUGFIX: std_headers was passed as the second positional argument of
        # urllib2.Request, which is the POST *data* slot, not headers; pass it
        # explicitly in the headers slot (data=None).
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', None, std_headers)

        return [{
            'id': video_id.decode('utf-8'),
            'url': mediaURL,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': title,
            'ext': u'mp3',
            'format': u'NA',
            'player_url': None,
            'description': description.decode('utf-8')
        }]
2676
2677
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP stream URL, title and description for an
        infoq.com presentation page. Returns a single-element info list
        or None after reporting trouble."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        self.report_extraction(url)

        # Extract video URL (the page embeds it base64-encoded)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        # BUGFIX: split('.') raised ValueError when the filename contained
        # more than one dot; split only on the last dot, and fail cleanly
        # if there is none.
        if '.' not in video_filename:
            self._downloader.trouble(u'ERROR: unable to determine video id and extension')
            return
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
            'player_url': None,
        }

        return [info]
2748
2749 class MixcloudIE(InfoExtractor):
2750 """Information extractor for www.mixcloud.com"""
2751 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2752 IE_NAME = u'mixcloud'
2753
2754 def __init__(self, downloader=None):
2755 InfoExtractor.__init__(self, downloader)
2756
2757 def report_download_json(self, file_id):
2758 """Report JSON download."""
2759 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2760
2761 def report_extraction(self, file_id):
2762 """Report information extraction."""
2763 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2764
2765 def get_urls(self, jsonData, fmt, bitrate='best'):
2766 """Get urls from 'audio_formats' section in json"""
2767 file_url = None
2768 try:
2769 bitrate_list = jsonData[fmt]
2770 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2771 bitrate = max(bitrate_list) # select highest
2772
2773 url_list = jsonData[fmt][bitrate]
2774 except TypeError: # we have no bitrate info.
2775 url_list = jsonData[fmt]
2776 return url_list
2777
2778 def check_urls(self, url_list):
2779 """Returns 1st active url from list"""
2780 for url in url_list:
2781 try:
2782 urllib2.urlopen(url)
2783 return url
2784 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2785 url = None
2786
2787 return None
2788
2789 def _print_formats(self, formats):
2790 print 'Available formats:'
2791 for fmt in formats.keys():
2792 for b in formats[fmt]:
2793 try:
2794 ext = formats[fmt][b][0]
2795 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2796 except TypeError: # we have no bitrate info
2797 ext = formats[fmt][0]
2798 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2799 break
2800
2801 def _real_extract(self, url):
2802 mobj = re.match(self._VALID_URL, url)
2803 if mobj is None:
2804 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2805 return
2806 # extract uploader & filename from url
2807 uploader = mobj.group(1).decode('utf-8')
2808 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2809
2810 # construct API request
2811 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2812 # retrieve .json file with links to files
2813 request = urllib2.Request(file_url)
2814 try:
2815 self.report_download_json(file_url)
2816 jsonData = urllib2.urlopen(request).read()
2817 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2818 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2819 return
2820
2821 # parse JSON
2822 json_data = json.loads(jsonData)
2823 player_url = json_data['player_swf_url']
2824 formats = dict(json_data['audio_formats'])
2825
2826 req_format = self._downloader.params.get('format', None)
2827 bitrate = None
2828
2829 if self._downloader.params.get('listformats', None):
2830 self._print_formats(formats)
2831 return
2832
2833 if req_format is None or req_format == 'best':
2834 for format_param in formats.keys():
2835 url_list = self.get_urls(formats, format_param)
2836 # check urls
2837 file_url = self.check_urls(url_list)
2838 if file_url is not None:
2839 break # got it!
2840 else:
2841 if req_format not in formats.keys():
2842 self._downloader.trouble(u'ERROR: format is not available')
2843 return
2844
2845 url_list = self.get_urls(formats, req_format)
2846 file_url = self.check_urls(url_list)
2847 format_param = req_format
2848
2849 return [{
2850 'id': file_id.decode('utf-8'),
2851 'url': file_url.decode('utf-8'),
2852 'uploader': uploader.decode('utf-8'),
2853 'upload_date': u'NA',
2854 'title': json_data['name'],
2855 'ext': file_url.split('.')[-1].decode('utf-8'),
2856 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2857 'thumbnail': json_data['thumbnail_url'],
2858 'description': json_data['description'],
2859 'player_url': player_url.decode('utf-8'),
2860 }]
2861
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: a VideoPage (course + video), a CoursePage
    # (course only), or the bare HomePage/root, distinguished below by
    # which named groups matched.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video yields one info dict; a
        course page or the root page is treated as a playlist and each
        entry is fed back through self.extract() recursively."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
            }

            self.report_extraction(info['id'])
            # Metadata for each video lives in a per-course XML file.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                # videoFile in the XML is relative to the course's videos dir
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # fall back to the course id when no <h1> title is present
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # recurse: each reference is a VideoPage URL handled by the
                # first branch above
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']

            # Every CoursePage link on the root page becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # recurse: each reference is a CoursePage URL handled by the
                # second branch above
                results += self.extract(entry['url'])
            return results
2973
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the mtv_* meta tags off the page, request the mediaGen
        metadata XML, and return the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # _VALID_URL allows scheme-less URLs; urlopen needs an absolute one
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUGFIX: message previously read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        try:
            metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        # BUGFIX: renditions[-1] raised IndexError when the metadata XML
        # contained no <rendition> elements; fail with a message instead.
        if not renditions:
            self._downloader.trouble(u'ERROR: unable to extract renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # BUGFIX: rendition.find() returns None when <src> is missing,
            # so .text raises AttributeError, which KeyError alone missed.
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3062
3063
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as a sequence of segments; the playlist JSON
    carries an obfuscated file id plus a per-segment key, from which the
    real segment URLs are reconstructed.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a pseudo-random session id (timestamp + two randoms)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character permutation used to de-obfuscate file ids.

        Implements Youku's seeded LCG shuffle of a fixed alphabet; the
        result is a list of characters indexed by _get_file_id.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated '*'-separated file id using the seed."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys=[]
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception while
            # keeping the deliberately broad catch for malformed JSON
            # (region-locked responses often lack the expected keys).
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'title': video_title,
                'ext': ext,
                'format': u'NA'
            }
            files_info.append(info)

        return files_info
3184
3185
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report that the page is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that information is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv url, title and thumbnail out of the page and
        return a single-element info list."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Fetch the page body
        try:
            webpage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv url is URL-encoded inside a query parameter
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = urllib.unquote(match.group(1).decode('utf-8'))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1).decode('utf-8')

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
            'player_url': None,
        }]
3249
3250
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader name"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

    def _real_extract(self, url):
        """Extract the highest-resolution video link from a Google+ post.

        Two-step process: fetch the post page for metadata, then the
        photo/video page it references for the actual stream links.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = urllib2.Request(post_url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
            return

        # Extract update date
        upload_date = u'NA'
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = u'NA'
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: was missing a return here, so mobj.group(1) below
            # raised AttributeError instead of failing cleanly.
            return

        video_page = mobj.group(1)
        request = urllib2.Request(video_page)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: was missing a return here, so links[-1] below raised
            # IndexError on an empty list instead of failing cleanly.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = unicode(video_url, "unicode_escape")


        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date.decode('utf-8'),
            'title': video_title.decode('utf-8'),
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
3373
3374
3375
3376 class YouPornIE(InfoExtractor):
3377 """Information extractor for youporn.com."""
3378
3379 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3380 IE_NAME = u'youporn'
3381 VIDEO_TITLE_RE = r'videoTitleArea">(?P<title>.*)</h1>'
3382 VIDEO_DATE_RE = r'Date:</b>(?P<date>.*)</li>'
3383 VIDEO_UPLOADER_RE = r'Submitted:</b>(?P<uploader>.*)</li>'
3384 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3385 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3386
3387 def __init__(self, downloader=None):
3388 InfoExtractor.__init__(self, downloader)
3389
3390 def report_id(self, video_id):
3391 """Report finding video ID"""
3392 self._downloader.to_screen(u'[youporn] Video ID: %s' % video_id)
3393
3394 def report_webpage(self, url):
3395 """Report downloading page"""
3396 self._downloader.to_screen(u'[youporn] Downloaded page: %s' % url)
3397
3398 def report_title(self, video_title):
3399 """Report dfinding title"""
3400 self._downloader.to_screen(u'[youporn] Title: %s' % video_title)
3401
3402 def report_uploader(self, uploader):
3403 """Report dfinding title"""
3404 self._downloader.to_screen(u'[youporn] Uploader: %s' % uploader)
3405
3406 def report_upload_date(self, video_date):
3407 """Report finding date"""
3408 self._downloader.to_screen(u'[youporn] Date: %s' % video_date)
3409
3410 def _print_formats(self, formats):
3411 """Print all available formats"""
3412 print 'Available formats:'
3413 print u'ext\t\tformat'
3414 print u'---------------------------------'
3415 for format in formats:
3416 print u'%s\t\t%s' % (format['ext'], format['format'])
3417
3418 def _specific(self, req_format, formats):
3419 for x in formats:
3420 if(x["format"]==req_format):
3421 return x
3422 return None
3423
3424
3425 def _real_extract(self, url):
3426 mobj = re.match(self._VALID_URL, url)
3427 if mobj is None:
3428 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3429 return
3430
3431 video_id = mobj.group('videoid').decode('utf-8')
3432 self.report_id(video_id)
3433
3434 # Get webpage content
3435 try:
3436 webpage = urllib2.urlopen(url).read()
3437 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3438 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3439 return
3440 self.report_webpage(url)
3441
3442 # Get the video title
3443 result = re.search(self.VIDEO_TITLE_RE, webpage)
3444 if result is None:
3445 self._downloader.trouble(u'ERROR: unable to extract video title')
3446 return
3447 video_title = result.group('title').decode('utf-8').strip()
3448 self.report_title(video_title)
3449
3450 # Get the video date
3451 result = re.search(self.VIDEO_DATE_RE, webpage)
3452 if result is None:
3453 self._downloader.trouble(u'ERROR: unable to extract video date')
3454 return
3455 upload_date = result.group('date').decode('utf-8').strip()
3456 self.report_upload_date(upload_date)
3457
3458 # Get the video uploader
3459 result = re.search(self.VIDEO_UPLOADER_RE, webpage)
3460 if result is None:
3461 self._downloader.trouble(u'ERROR: unable to extract uploader')
3462 return
3463 video_uploader = result.group('uploader').decode('utf-8').strip()
3464 video_uploader = clean_html( video_uploader )
3465 self.report_uploader(video_uploader)
3466
3467 # Get all of the formats available
3468 result = re.search(self.DOWNLOAD_LIST_RE, webpage)
3469 if result is None:
3470 self._downloader.trouble(u'ERROR: unable to extract download list')
3471 return
3472 download_list_html = result.group('download_list').decode('utf-8').strip()
3473
3474 # Get all of the links from the page
3475 links = re.findall(self.LINK_RE, download_list_html)
3476 if(len(links) == 0):
3477 self._downloader.trouble(u'ERROR: no known formats available for video')
3478 return
3479
3480 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3481
3482 formats = []
3483 for link in links:
3484
3485 # A link looks like this:
3486 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3487 # A path looks like this:
3488 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3489 video_url = unescapeHTML( link.decode('utf-8') )
3490 path = urlparse( video_url ).path
3491 extension = os.path.splitext( path )[1][1:]
3492 format = path.split('/')[4].split('_')[:2]
3493 size = format[0]
3494 bitrate = format[1]
3495 format = "-".join( format )
3496 title = u'%s-%s-%s' % (video_title, size, bitrate)
3497
3498 formats.append({
3499 'id': video_id,
3500 'url': video_url,
3501 'uploader': video_uploader,
3502 'upload_date': upload_date,
3503 'title': title,
3504 'ext': extension,
3505 'format': format,
3506 'thumbnail': None,
3507 'description': None,
3508 'player_url': None
3509 })
3510
3511 if self._downloader.params.get('listformats', None):
3512 self._print_formats(formats)
3513 return
3514
3515 req_format = self._downloader.params.get('format', None)
3516 #format_limit = self._downloader.params.get('format_limit', None)
3517 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3518
3519
3520 if req_format is None or req_format == 'best':
3521 return [formats[0]]
3522 elif req_format == 'worst':
3523 return [formats[-1]]
3524 elif req_format in ('-1', 'all'):
3525 return formats
3526 else:
3527 format = self._specific( req_format, formats )
3528 if result is None:
3529 self._downloader.trouble(u'ERROR: requested format not available')
3530 return
3531 return [format]
3532
3533
3534
3535
3536 class PornotubeIE(InfoExtractor):
3537 """Information extractor for pornotube.com."""
3538
3539 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3540 IE_NAME = u'pornotube'
3541 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3542 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3543
3544
3545 def __init__(self, downloader=None):
3546 InfoExtractor.__init__(self, downloader)
3547
3548 def report_extract_entry(self, url):
3549 """Report downloading extry"""
3550 self._downloader.to_screen(u'[pornotube] Downloading entry: %s' % url.decode('utf-8'))
3551
3552 def report_date(self, upload_date):
3553 """Report finding uploaded date"""
3554 self._downloader.to_screen(u'[pornotube] Entry date: %s' % upload_date)
3555
3556 def report_webpage(self, url):
3557 """Report downloading page"""
3558 self._downloader.to_screen(u'[pornotube] Downloaded page: %s' % url)
3559
3560 def report_title(self, video_title):
3561 """Report downloading extry"""
3562 self._downloader.to_screen(u'[pornotube] Title: %s' % video_title.decode('utf-8'))
3563
3564 def _real_extract(self, url):
3565 mobj = re.match(self._VALID_URL, url)
3566 if mobj is None:
3567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3568 return
3569
3570 video_id = mobj.group('videoid').decode('utf-8')
3571 video_title = mobj.group('title').decode('utf-8')
3572 self.report_title(video_title);
3573
3574 # Get webpage content
3575 try:
3576 webpage = urllib2.urlopen(url).read()
3577 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3578 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3579 return
3580 self.report_webpage(url)
3581
3582 # Get the video URL
3583 result = re.search(self.VIDEO_URL_RE, webpage)
3584 if result is None:
3585 self._downloader.trouble(u'ERROR: unable to extract video url')
3586 return
3587 video_url = urllib.unquote(result.group('url').decode('utf-8'))
3588 self.report_extract_entry(video_url)
3589
3590 #Get the uploaded date
3591 result = re.search(self.VIDEO_UPLOADED_RE, webpage)
3592 if result is None:
3593 self._downloader.trouble(u'ERROR: unable to extract video title')
3594 return
3595 upload_date = result.group('date').decode('utf-8')
3596 self.report_date(upload_date);
3597
3598
3599 info = {'id': video_id,
3600 'url': video_url,
3601 'uploader': None,
3602 'upload_date': upload_date,
3603 'title': video_title,
3604 'ext': 'flv',
3605 'format': 'flv',
3606 'thumbnail': None,
3607 'description': None,
3608 'player_url': None}
3609
3610 return [info]
3611
3612
3613
3614
3615 class YouJizzIE(InfoExtractor):
3616 """Information extractor for youjizz.com."""
3617
3618 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/([^.]+).html$'
3619 IE_NAME = u'youjizz'
3620 VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>'
3621 EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)'
3622 SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);'
3623
3624 def __init__(self, downloader=None):
3625 InfoExtractor.__init__(self, downloader)
3626
3627 def report_extract_entry(self, url):
3628 """Report downloading extry"""
3629 self._downloader.to_screen(u'[youjizz] Downloading entry: %s' % url.decode('utf-8'))
3630
3631 def report_webpage(self, url):
3632 """Report downloading page"""
3633 self._downloader.to_screen(u'[youjizz] Downloaded page: %s' % url)
3634
3635 def report_title(self, video_title):
3636 """Report downloading extry"""
3637 self._downloader.to_screen(u'[youjizz] Title: %s' % video_title.decode('utf-8'))
3638
3639 def report_embed_page(self, embed_page):
3640 """Report downloading extry"""
3641 self._downloader.to_screen(u'[youjizz] Embed Page: %s' % embed_page.decode('utf-8'))
3642
3643 def _real_extract(self, url):
3644 # Get webpage content
3645 try:
3646 webpage = urllib2.urlopen(url).read()
3647 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3648 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3649 return
3650 self.report_webpage(url)
3651
3652 # Get the video title
3653 result = re.search(self.VIDEO_TITLE_RE, webpage)
3654 if result is None:
3655 self._downloader.trouble(u'ERROR: unable to extract video title')
3656 return
3657 video_title = result.group('title').decode('utf-8').strip()
3658 self.report_title(video_title)
3659
3660 # Get the embed page
3661 result = re.search(self.EMBED_PAGE_RE, webpage)
3662 if result is None:
3663 self._downloader.trouble(u'ERROR: unable to extract embed page')
3664 return
3665
3666 embed_page_url = result.group(0).decode('utf-8').strip()
3667 video_id = result.group('videoid').decode('utf-8')
3668 self.report_embed_page(embed_page_url)
3669
3670 try:
3671 webpage = urllib2.urlopen(embed_page_url).read()
3672 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3673 self._downloader.trouble(u'ERROR: unable to download video embed page: %s' % err)
3674 return
3675
3676 # Get the video URL
3677 result = re.search(self.SOURCE_RE, webpage)
3678 if result is None:
3679 self._downloader.trouble(u'ERROR: unable to extract video url')
3680 return
3681 video_url = result.group('source').decode('utf-8')
3682 self.report_extract_entry(video_url)
3683
3684 info = {'id': video_id,
3685 'url': video_url,
3686 'uploader': None,
3687 'upload_date': None,
3688 'title': video_title,
3689 'ext': 'flv',
3690 'format': 'flv',
3691 'thumbnail': None,
3692 'description': None,
3693 'player_url': embed_page_url}
3694
3695 return [info]
3696