2 # -*- coding: utf-8 -*-
15 import xml
.etree
.ElementTree
18 from urlparse
import parse_qs
21 import cStringIO
as StringIO
class InfoExtractor(object):
    """Information Extractor base class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:   Nickname of the video uploader.
    title:      Video title, unescaped.
    ext:        Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional:

    format:     The video format, defaults to ext. Used by --get-format
    thumbnail:  Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # One-shot latch so _real_initialize() runs at most once per instance.
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp (used with re.VERBOSE) matching all watch-page URL forms
    # plus naked video IDs; group 2 captures the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension  (NOTE(review): entries besides '38' were not
    # visible in this view and are reconstructed — confirm against upstream)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions  (NOTE(review): reconstructed — confirm against upstream)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
159 def suitable(self
, url
):
160 """Receives a URL and returns True if suitable for this IE."""
161 return re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) is not None
163 def report_lang(self
):
164 """Report attempt to set language."""
165 self
._downloader
.to_screen(u
'[youtube] Setting language')
167 def report_login(self
):
168 """Report attempt to log in."""
169 self
._downloader
.to_screen(u
'[youtube] Logging in')
171 def report_age_confirmation(self
):
172 """Report attempt to confirm age."""
173 self
._downloader
.to_screen(u
'[youtube] Confirming age')
175 def report_video_webpage_download(self
, video_id
):
176 """Report attempt to download video webpage."""
177 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video webpage' % video_id
)
179 def report_video_info_webpage_download(self
, video_id
):
180 """Report attempt to download video info webpage."""
181 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video info webpage' % video_id
)
183 def report_video_subtitles_download(self
, video_id
):
184 """Report attempt to download video info webpage."""
185 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video subtitles' % video_id
)
187 def report_information_extraction(self
, video_id
):
188 """Report attempt to extract video information."""
189 self
._downloader
.to_screen(u
'[youtube] %s: Extracting video information' % video_id
)
191 def report_unavailable_format(self
, video_id
, format
):
192 """Report extracted video URL."""
193 self
._downloader
.to_screen(u
'[youtube] %s: Format %s not available' % (video_id
, format
))
195 def report_rtmp_download(self
):
196 """Indicate the download will use the RTMP protocol."""
197 self
._downloader
.to_screen(u
'[youtube] RTMP download detected')
199 def _closed_captions_xml_to_srt(self
, xml_string
):
201 texts
= re
.findall(r
'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string
, re
.MULTILINE
)
202 # TODO parse xml instead of regex
203 for n
, (start
, dur_tag
, dur
, caption
) in enumerate(texts
):
204 if not dur
: dur
= '4'
206 end
= start
+ float(dur
)
207 start
= "%02i:%02i:%02i,%03i" %(start
/(60*60), start
/60%60, start
%60, start
%1*1000)
208 end
= "%02i:%02i:%02i,%03i" %(end
/(60*60), end
/60%60, end
%60, end
%1*1000)
209 caption
= unescapeHTML(caption
)
210 caption
= unescapeHTML(caption
) # double cycle, intentional
211 srt
+= str(n
+1) + '\n'
212 srt
+= start
+ ' --> ' + end
+ '\n'
213 srt
+= caption
+ '\n\n'
216 def _print_formats(self
, formats
):
217 print('Available formats:')
219 print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???')))
221 def _real_initialize(self
):
222 if self
._downloader
is None:
227 downloader_params
= self
._downloader
.params
229 # Attempt to use provided username and password or .netrc data
230 if downloader_params
.get('username', None) is not None:
231 username
= downloader_params
['username']
232 password
= downloader_params
['password']
233 elif downloader_params
.get('usenetrc', False):
235 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
240 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
241 except (IOError, netrc
.NetrcParseError
), err
:
242 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % compat_str(err
))
246 request
= urllib2
.Request(self
._LANG
_URL
)
249 urllib2
.urlopen(request
).read()
250 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
251 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % compat_str(err
))
254 # No authentication to be performed
260 'current_form': 'loginForm',
262 'action_login': 'Log In',
263 'username': username
,
264 'password': password
,
266 request
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
))
269 login_results
= urllib2
.urlopen(request
).read()
270 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
271 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
273 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
274 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % compat_str(err
))
280 'action_confirm': 'Confirm',
282 request
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
))
284 self
.report_age_confirmation()
285 age_results
= urllib2
.urlopen(request
).read()
286 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
287 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % compat_str(err
))
290 def _real_extract(self
, url
):
291 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
292 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
294 url
= 'http://www.youtube.com/' + urllib
.unquote(mobj
.group(1)).lstrip('/')
296 # Extract video id from URL
297 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
299 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
301 video_id
= mobj
.group(2)
304 self
.report_video_webpage_download(video_id
)
305 request
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
)
307 video_webpage
= urllib2
.urlopen(request
).read()
308 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
309 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % compat_str(err
))
312 # Attempt to extract SWF player URL
313 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
315 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
320 self
.report_video_info_webpage_download(video_id
)
321 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
322 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
323 % (video_id
, el_type
))
324 request
= urllib2
.Request(video_info_url
)
326 video_info_webpage
= urllib2
.urlopen(request
).read()
327 video_info
= parse_qs(video_info_webpage
)
328 if 'token' in video_info
:
330 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
331 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % compat_str(err
))
333 if 'token' not in video_info
:
334 if 'reason' in video_info
:
335 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8'))
337 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
340 # Check for "rental" videos
341 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
342 self
._downloader
.trouble(u
'ERROR: "rental" videos not supported')
345 # Start extracting information
346 self
.report_information_extraction(video_id
)
349 if 'author' not in video_info
:
350 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
352 video_uploader
= urllib
.unquote_plus(video_info
['author'][0])
355 if 'title' not in video_info
:
356 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
358 video_title
= urllib
.unquote_plus(video_info
['title'][0])
359 video_title
= video_title
.decode('utf-8')
362 if 'thumbnail_url' not in video_info
:
363 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
365 else: # don't panic if we can't find it
366 video_thumbnail
= urllib
.unquote_plus(video_info
['thumbnail_url'][0])
370 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
372 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
373 format_expressions
= ['%d %B %Y', '%B %d %Y', '%b %d %Y']
374 for expression
in format_expressions
:
376 upload_date
= datetime
.datetime
.strptime(upload_date
, expression
).strftime('%Y%m%d')
381 video_description
= get_element_by_id("eow-description", video_webpage
.decode('utf8'))
382 if video_description
: video_description
= clean_html(video_description
)
383 else: video_description
= ''
386 video_subtitles
= None
387 if self
._downloader
.params
.get('writesubtitles', False):
389 self
.report_video_subtitles_download(video_id
)
390 request
= urllib2
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
)
392 srt_list
= urllib2
.urlopen(request
).read()
393 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
394 raise Trouble(u
'WARNING: unable to download video subtitles: %s' % compat_str(err
))
395 srt_lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list
)
396 srt_lang_list
= dict((l
[1], l
[0]) for l
in srt_lang_list
)
397 if not srt_lang_list
:
398 raise Trouble(u
'WARNING: video has no closed captions')
399 if self
._downloader
.params
.get('subtitleslang', False):
400 srt_lang
= self
._downloader
.params
.get('subtitleslang')
401 elif 'en' in srt_lang_list
:
404 srt_lang
= srt_lang_list
.keys()[0]
405 if not srt_lang
in srt_lang_list
:
406 raise Trouble(u
'WARNING: no closed captions found in the specified language')
407 request
= urllib2
.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang
, srt_lang_list
[srt_lang
], video_id
))
409 srt_xml
= urllib2
.urlopen(request
).read()
410 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
411 raise Trouble(u
'WARNING: unable to download video subtitles: %s' % compat_str(err
))
413 raise Trouble(u
'WARNING: unable to download video subtitles')
414 video_subtitles
= self
._closed
_captions
_xml
_to
_srt
(srt_xml
.decode('utf-8'))
415 except Trouble
as trouble
:
416 self
._downloader
.trouble(trouble
[0])
418 if 'length_seconds' not in video_info
:
419 self
._downloader
.trouble(u
'WARNING: unable to extract video duration')
422 video_duration
= urllib
.unquote_plus(video_info
['length_seconds'][0])
425 video_token
= urllib
.unquote_plus(video_info
['token'][0])
427 # Decide which formats to download
428 req_format
= self
._downloader
.params
.get('format', None)
430 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
431 self
.report_rtmp_download()
432 video_url_list
= [(None, video_info
['conn'][0])]
433 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
434 url_data_strs
= video_info
['url_encoded_fmt_stream_map'][0].split(',')
435 url_data
= [parse_qs(uds
) for uds
in url_data_strs
]
436 url_data
= filter(lambda ud
: 'itag' in ud
and 'url' in ud
, url_data
)
437 url_map
= dict((ud
['itag'][0], ud
['url'][0] + '&signature=' + ud
['sig'][0]) for ud
in url_data
)
439 format_limit
= self
._downloader
.params
.get('format_limit', None)
440 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
441 if format_limit
is not None and format_limit
in available_formats
:
442 format_list
= available_formats
[available_formats
.index(format_limit
):]
444 format_list
= available_formats
445 existing_formats
= [x
for x
in format_list
if x
in url_map
]
446 if len(existing_formats
) == 0:
447 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
449 if self
._downloader
.params
.get('listformats', None):
450 self
._print
_formats
(existing_formats
)
452 if req_format
is None or req_format
== 'best':
453 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
454 elif req_format
== 'worst':
455 video_url_list
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality
456 elif req_format
in ('-1', 'all'):
457 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
459 # Specific formats. We pick the first in a slash-delimeted sequence.
460 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
461 req_formats
= req_format
.split('/')
462 video_url_list
= None
463 for rf
in req_formats
:
465 video_url_list
= [(rf
, url_map
[rf
])]
467 if video_url_list
is None:
468 self
._downloader
.trouble(u
'ERROR: requested format not available')
471 self
._downloader
.trouble(u
'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
475 for format_param
, video_real_url
in video_url_list
:
477 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
479 video_format
= '{} - {}'.format(format_param
.decode('utf-8') if format_param
else video_extension
.decode('utf-8'),
480 self
._video
_dimensions
.get(format_param
, '???'))
483 'id': video_id
.decode('utf-8'),
484 'url': video_real_url
.decode('utf-8'),
485 'uploader': video_uploader
.decode('utf-8'),
486 'upload_date': upload_date
,
487 'title': video_title
,
488 'ext': video_extension
.decode('utf-8'),
489 'format': video_format
,
490 'thumbnail': video_thumbnail
.decode('utf-8'),
491 'description': video_description
,
492 'player_url': player_url
,
493 'subtitles': video_subtitles
,
494 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed IDs to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars-encoded media data.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL, title, uploader and date."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = urllib.unquote(mobj.group(1))

        # Pick the best quality present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; store as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and (optionally) thumbnail."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download link: fall back to the escaped flv URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from the page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  u'NA',
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
885 class YahooIE(InfoExtractor
):
886 """Information extractor for video.yahoo.com."""
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME
= u
'video.yahoo'
894 def __init__(self
, downloader
=None):
895 InfoExtractor
.__init
__(self
, downloader
)
897 def report_download_webpage(self
, video_id
):
898 """Report webpage download."""
899 self
._downloader
.to_screen(u
'[video.yahoo] %s: Downloading webpage' % video_id
)
901 def report_extraction(self
, video_id
):
902 """Report information extraction."""
903 self
._downloader
.to_screen(u
'[video.yahoo] %s: Extracting information' % video_id
)
905 def _real_extract(self
, url
, new_video
=True):
906 # Extract ID from URL
907 mobj
= re
.match(self
._VALID
_URL
, url
)
909 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
912 video_id
= mobj
.group(2)
913 video_extension
= 'flv'
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re
.match(self
._VPAGE
_URL
, url
) is None:
918 request
= urllib2
.Request(url
)
920 webpage
= urllib2
.urlopen(request
).read()
921 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
922 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
925 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
927 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
929 yahoo_id
= mobj
.group(1)
931 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
933 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
935 yahoo_vid
= mobj
.group(1)
937 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
938 return self
._real
_extract
(url
, new_video
=False)
940 # Retrieve video webpage to extract further information
941 request
= urllib2
.Request(url
)
943 self
.report_download_webpage(video_id
)
944 webpage
= urllib2
.urlopen(request
).read()
945 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
946 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
949 # Extract uploader and title from webpage
950 self
.report_extraction(video_id
)
951 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
953 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
955 video_title
= mobj
.group(1).decode('utf-8')
957 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
959 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
961 video_uploader
= mobj
.group(1).decode('utf-8')
963 # Extract video thumbnail
964 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
966 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
968 video_thumbnail
= mobj
.group(1).decode('utf-8')
970 # Extract video description
971 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
973 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
975 video_description
= mobj
.group(1).decode('utf-8')
976 if not video_description
:
977 video_description
= 'No description available.'
979 # Extract video height and width
980 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
982 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
984 yv_video_height
= mobj
.group(1)
986 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
988 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
990 yv_video_width
= mobj
.group(1)
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
997 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
998 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
999 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1001 self
.report_download_webpage(video_id
)
1002 webpage
= urllib2
.urlopen(request
).read()
1003 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1004 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
1007 # Extract media URL from playlist XML
1008 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
1010 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
1012 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
1013 video_url
= unescapeHTML(video_url
)
1016 'id': video_id
.decode('utf-8'),
1018 'uploader': video_uploader
,
1019 'upload_date': u
'NA',
1020 'title': video_title
,
1021 'ext': video_extension
.decode('utf-8'),
1022 'thumbnail': video_thumbnail
.decode('utf-8'),
1023 'description': video_description
,
1024 'thumbnail': video_thumbnail
,
1029 class VimeoIE(InfoExtractor
):
1030 """Information extractor for vimeo.com."""
1032 # _VALID_URL matches Vimeo URLs
1033 _VALID_URL
= r
'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1036 def __init__(self
, downloader
=None):
1037 InfoExtractor
.__init
__(self
, downloader
)
1039 def report_download_webpage(self
, video_id
):
1040 """Report webpage download."""
1041 self
._downloader
.to_screen(u
'[vimeo] %s: Downloading webpage' % video_id
)
1043 def report_extraction(self
, video_id
):
1044 """Report information extraction."""
1045 self
._downloader
.to_screen(u
'[vimeo] %s: Extracting information' % video_id
)
1047 def _real_extract(self
, url
, new_video
=True):
1048 # Extract ID from URL
1049 mobj
= re
.match(self
._VALID
_URL
, url
)
1051 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1054 video_id
= mobj
.group(1)
1056 # Retrieve video webpage to extract further information
1057 request
= urllib2
.Request(url
, None, std_headers
)
1059 self
.report_download_webpage(video_id
)
1060 webpage
= urllib2
.urlopen(request
).read()
1061 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1062 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
1065 # Now we begin extracting as much information as we can from what we
1066 # retrieved. First we extract the information common to all extractors,
1067 # and latter we extract those that are Vimeo specific.
1068 self
.report_extraction(video_id
)
1070 # Extract the config JSON
1071 config
= webpage
.split(' = {config:')[1].split(',assets:')[0]
1073 config
= json
.loads(config
)
1075 self
._downloader
.trouble(u
'ERROR: unable to extract info section')
1079 video_title
= config
["video"]["title"]
1082 video_uploader
= config
["video"]["owner"]["name"]
1084 # Extract video thumbnail
1085 video_thumbnail
= config
["video"]["thumbnail"]
1087 # Extract video description
1088 video_description
= get_element_by_id("description", webpage
.decode('utf8'))
1089 if video_description
: video_description
= clean_html(video_description
)
1090 else: video_description
= ''
1092 # Extract upload date
1093 video_upload_date
= u
'NA'
1094 mobj
= re
.search(r
'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage
)
1095 if mobj
is not None:
1096 video_upload_date
= mobj
.group(1)
1098 # Vimeo specific: extract request signature and timestamp
1099 sig
= config
['request']['signature']
1100 timestamp
= config
['request']['timestamp']
1102 # Vimeo specific: extract video codec and quality information
1103 # First consider quality, then codecs, then take everything
1104 # TODO bind to format param
1105 codecs
= [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1106 files
= { 'hd': [], 'sd': [], 'other': []}
1107 for codec_name
, codec_extension
in codecs
:
1108 if codec_name
in config
["video"]["files"]:
1109 if 'hd' in config
["video"]["files"][codec_name
]:
1110 files
['hd'].append((codec_name
, codec_extension
, 'hd'))
1111 elif 'sd' in config
["video"]["files"][codec_name
]:
1112 files
['sd'].append((codec_name
, codec_extension
, 'sd'))
1114 files
['other'].append((codec_name
, codec_extension
, config
["video"]["files"][codec_name
][0]))
1116 for quality
in ('hd', 'sd', 'other'):
1117 if len(files
[quality
]) > 0:
1118 video_quality
= files
[quality
][0][2]
1119 video_codec
= files
[quality
][0][0]
1120 video_extension
= files
[quality
][0][1]
1121 self
._downloader
.to_screen(u
'[vimeo] %s: Downloading %s file at %s quality' % (video_id
, video_codec
.upper(), video_quality
))
1124 self
._downloader
.trouble(u
'ERROR: no known codec found')
1127 video_url
= "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1128 %(video_id
, sig
, timestamp
, video_quality
, video_codec
.upper())
1133 'uploader': video_uploader
,
1134 'upload_date': video_upload_date
,
1135 'title': video_title
,
1136 'ext': video_extension
,
1137 'thumbnail': video_thumbnail
,
1138 'description': video_description
,
1143 class GenericIE(InfoExtractor
):
1144 """Generic last-resort information extractor."""
1147 IE_NAME
= u
'generic'
1149 def __init__(self
, downloader
=None):
1150 InfoExtractor
.__init
__(self
, downloader
)
1152 def report_download_webpage(self
, video_id
):
1153 """Report webpage download."""
1154 self
._downloader
.to_screen(u
'WARNING: Falling back on generic information extractor.')
1155 self
._downloader
.to_screen(u
'[generic] %s: Downloading webpage' % video_id
)
1157 def report_extraction(self
, video_id
):
1158 """Report information extraction."""
1159 self
._downloader
.to_screen(u
'[generic] %s: Extracting information' % video_id
)
1161 def report_following_redirect(self
, new_url
):
1162 """Report information extraction."""
1163 self
._downloader
.to_screen(u
'[redirect] Following redirect to %s' % new_url
)
1165 def _test_redirect(self
, url
):
1166 """Check if it is a redirect, like url shorteners, in case restart chain."""
1167 class HeadRequest(urllib2
.Request
):
1168 def get_method(self
):
1171 class HEADRedirectHandler(urllib2
.HTTPRedirectHandler
):
1173 Subclass the HTTPRedirectHandler to make it use our
1174 HeadRequest also on the redirected URL
1176 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
1177 if code
in (301, 302, 303, 307):
1178 newurl
= newurl
.replace(' ', '%20')
1179 newheaders
= dict((k
,v
) for k
,v
in req
.headers
.items()
1180 if k
.lower() not in ("content-length", "content-type"))
1181 return HeadRequest(newurl
,
1183 origin_req_host
=req
.get_origin_req_host(),
1186 raise urllib2
.HTTPError(req
.get_full_url(), code
, msg
, headers
, fp
)
1188 class HTTPMethodFallback(urllib2
.BaseHandler
):
1190 Fallback to GET if HEAD is not allowed (405 HTTP error)
1192 def http_error_405(self
, req
, fp
, code
, msg
, headers
):
1196 newheaders
= dict((k
,v
) for k
,v
in req
.headers
.items()
1197 if k
.lower() not in ("content-length", "content-type"))
1198 return self
.parent
.open(urllib2
.Request(req
.get_full_url(),
1200 origin_req_host
=req
.get_origin_req_host(),
1204 opener
= urllib2
.OpenerDirector()
1205 for handler
in [urllib2
.HTTPHandler
, urllib2
.HTTPDefaultErrorHandler
,
1206 HTTPMethodFallback
, HEADRedirectHandler
,
1207 urllib2
.HTTPErrorProcessor
, urllib2
.HTTPSHandler
]:
1208 opener
.add_handler(handler())
1210 response
= opener
.open(HeadRequest(url
))
1211 new_url
= response
.geturl()
1213 if url
== new_url
: return False
1215 self
.report_following_redirect(new_url
)
1216 self
._downloader
.download([new_url
])
1219 def _real_extract(self
, url
):
1220 if self
._test
_redirect
(url
): return
1222 video_id
= url
.split('/')[-1]
1223 request
= urllib2
.Request(url
)
1225 self
.report_download_webpage(video_id
)
1226 webpage
= urllib2
.urlopen(request
).read()
1227 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1228 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
1230 except ValueError, err
:
1231 # since this is the last-resort InfoExtractor, if
1232 # this error is thrown, it'll be thrown here
1233 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1236 self
.report_extraction(video_id
)
1237 # Start with something easy: JW Player in SWFObject
1238 mobj
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1240 # Broaden the search a little bit
1241 mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage)
1243 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1246 # It's possible that one of the regexes
1247 # matched, but returned an empty group:
1248 if mobj.group(1) is None:
1249 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1252 video_url = urllib.unquote(mobj.group(1))
1253 video_id = os.path.basename(video_url)
1255 # here's a fun little line of code for you:
1256 video_extension = os.path.splitext(video_id)[1][1:]
1257 video_id = os.path.splitext(video_id)[0]
1259 # it's tempting to parse this further, but you would
1260 # have to take into account all the variations like
1261 # Video Title - Site Name
1262 # Site Name | Video Title
1263 # Video Title - Tagline | Site Name
1264 # and so on and so forth; it's just not practical
1265 mobj = re.search(r'<title>(.*)</title>', webpage)
1267 self._downloader.trouble(u'ERROR: unable to extract title')
1269 video_title = mobj.group(1).decode('utf-8')
1271 # video uploader is domain name
1272 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1274 self._downloader.trouble(u'ERROR: unable to extract title')
1276 video_uploader = mobj.group(1).decode('utf-8')
1279 'id': video_id.decode('utf-8'),
1280 'url': video_url.decode('utf-8'),
1281 'uploader': video_uploader,
1282 'upload_date': u'NA',
1283 'title': video_title,
1284 'ext': video_extension.decode('utf-8'),
1289 class YoutubeSearchIE(InfoExtractor):
1290 """Information Extractor for YouTube search queries."""
1291 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1292 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1293 _max_youtube_results = 1000
1294 IE_NAME = u'youtube:search'
1296 def __init__(self, downloader=None):
1297 InfoExtractor.__init__(self, downloader)
1299 def report_download_page(self, query, pagenum):
1300 """Report attempt to download search page with given number."""
1301 query = query.decode(preferredencoding())
1302 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1304 def _real_extract(self, query):
1305 mobj = re.match(self._VALID_URL, query)
1307 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1310 prefix, query = query.split(':')
1312 query = query.encode('utf-8')
1314 self._download_n_results(query, 1)
1316 elif prefix == 'all':
1317 self._download_n_results(query, self._max_youtube_results)
1323 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1325 elif n > self._max_youtube_results:
1326 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1327 n = self._max_youtube_results
1328 self._download_n_results(query, n)
1330 except ValueError: # parsing prefix as integer fails
1331 self._download_n_results(query, 1)
1334 def _download_n_results(self, query, n):
1335 """Downloads a specified number of results for a query"""
1341 while (50 * pagenum) < limit:
1342 self.report_download_page(query, pagenum+1)
1343 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1344 request = urllib2.Request(result_url)
1346 data = urllib2.urlopen(request).read()
1347 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1348 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1350 api_response = json.loads(data)['data']
1352 new_ids = list(video['id'] for video in api_response['items'])
1353 video_ids += new_ids
1355 limit = min(n, api_response['totalItems'])
1358 if len(video_ids) > n:
1359 video_ids = video_ids[:n]
1360 for id in video_ids:
1361 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1365 class GoogleSearchIE(InfoExtractor):
1366 """Information Extractor for Google Video search queries."""
1367 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1368 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1369 _VIDEO_INDICATOR = r'<a href="http
://video\
.google\
.com
/videoplay
\?docid
=([^
"\&]+)'
1370 _MORE_PAGES_INDICATOR = r'class="pn
" id="pnnext
"'
1371 _max_google_results = 1000
1372 IE_NAME = u'video.google:search'
1374 def __init__(self, downloader=None):
1375 InfoExtractor.__init__(self, downloader)
1377 def report_download_page(self, query, pagenum):
1378 """Report attempt to download playlist page with given number."""
1379 query = query.decode(preferredencoding())
1380 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1382 def _real_extract(self, query):
1383 mobj = re.match(self._VALID_URL, query)
1385 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1388 prefix, query = query.split(':')
1390 query = query.encode('utf-8')
1392 self._download_n_results(query, 1)
1394 elif prefix == 'all':
1395 self._download_n_results(query, self._max_google_results)
1401 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1403 elif n > self._max_google_results:
1404 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1405 n = self._max_google_results
1406 self._download_n_results(query, n)
1408 except ValueError: # parsing prefix as integer fails
1409 self._download_n_results(query, 1)
1412 def _download_n_results(self, query, n):
1413 """Downloads a specified number of results for a query"""
1419 self.report_download_page(query, pagenum)
1420 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1421 request = urllib2.Request(result_url)
1423 page = urllib2.urlopen(request).read()
1424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1425 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1428 # Extract video identifiers
1429 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1430 video_id = mobj.group(1)
1431 if video_id not in video_ids:
1432 video_ids.append(video_id)
1433 if len(video_ids) == n:
1434 # Specified n videos reached
1435 for id in video_ids:
1436 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1439 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1440 for id in video_ids:
1441 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1444 pagenum = pagenum + 1
1447 class YahooSearchIE(InfoExtractor):
1448 """Information Extractor for Yahoo! Video search queries."""
1449 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1450 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1451 _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"'
1452 _MORE_PAGES_INDICATOR = r'\s*Next'
1453 _max_yahoo_results = 1000
1454 IE_NAME = u'video.yahoo:search'
1456 def __init__(self, downloader=None):
1457 InfoExtractor.__init__(self, downloader)
1459 def report_download_page(self, query, pagenum):
1460 """Report attempt to download playlist page with given number."""
1461 query = query.decode(preferredencoding())
1462 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1464 def _real_extract(self, query):
1465 mobj = re.match(self._VALID_URL, query)
1467 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1470 prefix, query = query.split(':')
1472 query = query.encode('utf-8')
1474 self._download_n_results(query, 1)
1476 elif prefix == 'all':
1477 self._download_n_results(query, self._max_yahoo_results)
1483 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1485 elif n > self._max_yahoo_results:
1486 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1487 n = self._max_yahoo_results
1488 self._download_n_results(query, n)
1490 except ValueError: # parsing prefix as integer fails
1491 self._download_n_results(query, 1)
1494 def _download_n_results(self, query, n):
1495 """Downloads a specified number of results for a query"""
1498 already_seen = set()
1502 self.report_download_page(query, pagenum)
1503 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1504 request = urllib2.Request(result_url)
1506 page = urllib2.urlopen(request).read()
1507 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1508 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1511 # Extract video identifiers
1512 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1513 video_id = mobj.group(1)
1514 if video_id not in already_seen:
1515 video_ids.append(video_id)
1516 already_seen.add(video_id)
1517 if len(video_ids) == n:
1518 # Specified n videos reached
1519 for id in video_ids:
1520 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1523 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1524 for id in video_ids:
1525 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1528 pagenum = pagenum + 1
1531 class YoutubePlaylistIE(InfoExtractor):
1532 """Information Extractor for YouTube playlists."""
1534 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1535 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1536 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&
;)*list=.*?
%s'
1537 _MORE_PAGES_INDICATOR = r'yt
-uix
-pager
-next
'
1538 IE_NAME = u'youtube
:playlist
'
1540 def __init__(self, downloader=None):
1541 InfoExtractor.__init__(self, downloader)
1543 def report_download_page(self, playlist_id, pagenum):
1544 """Report attempt to download playlist page with given number."""
1545 self._downloader.to_screen(u'[youtube
] PL
%s: Downloading page
#%s' % (playlist_id, pagenum))
1547 def _real_extract(self
, url
):
1548 # Extract playlist id
1549 mobj
= re
.match(self
._VALID
_URL
, url
)
1551 self
._downloader
.trouble(u
'ERROR: invalid url: %s' % url
)
1555 if mobj
.group(3) is not None:
1556 self
._downloader
.download([mobj
.group(3)])
1559 # Download playlist pages
1560 # prefix is 'p' as default for playlists but there are other types that need extra care
1561 playlist_prefix
= mobj
.group(1)
1562 if playlist_prefix
== 'a':
1563 playlist_access
= 'artist'
1565 playlist_prefix
= 'p'
1566 playlist_access
= 'view_play_list'
1567 playlist_id
= mobj
.group(2)
1572 self
.report_download_page(playlist_id
, pagenum
)
1573 url
= self
._TEMPLATE
_URL
% (playlist_access
, playlist_prefix
, playlist_id
, pagenum
)
1574 request
= urllib2
.Request(url
)
1576 page
= urllib2
.urlopen(request
).read()
1577 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1578 self
._downloader
.trouble(u
'ERROR: unable to download webpage: %s' % compat_str(err
))
1581 # Extract video identifiers
1583 for mobj
in re
.finditer(self
._VIDEO
_INDICATOR
_TEMPLATE
% playlist_id
, page
):
1584 if mobj
.group(1) not in ids_in_page
:
1585 ids_in_page
.append(mobj
.group(1))
1586 video_ids
.extend(ids_in_page
)
1588 if re
.search(self
._MORE
_PAGES
_INDICATOR
, page
) is None:
1590 pagenum
= pagenum
+ 1
1592 playliststart
= self
._downloader
.params
.get('playliststart', 1) - 1
1593 playlistend
= self
._downloader
.params
.get('playlistend', -1)
1594 if playlistend
== -1:
1595 video_ids
= video_ids
[playliststart
:]
1597 video_ids
= video_ids
[playliststart
:playlistend
]
1599 for id in video_ids
:
1600 self
._downloader
.download(['http://www.youtube.com/watch?v=%s' % id])
1604 class YoutubeChannelIE(InfoExtractor
):
1605 """Information Extractor for YouTube channels."""
1607 _VALID_URL
= r
"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1608 _TEMPLATE_URL
= 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1609 _MORE_PAGES_INDICATOR
= r
'yt-uix-button-content">Next' # TODO
1610 IE_NAME
= u
'youtube:channel'
1612 def report_download_page(self
, channel_id
, pagenum
):
1613 """Report attempt to download channel page with given number."""
1614 self
._downloader
.to_screen(u
'[youtube] Channel %s: Downloading page #%s' % (channel_id
, pagenum
))
1616 def _real_extract(self
, url
):
1617 # Extract channel id
1618 mobj
= re
.match(self
._VALID
_URL
, url
)
1620 self
._downloader
.trouble(u
'ERROR: invalid url: %s' % url
)
1623 # Download channel pages
1624 channel_id
= mobj
.group(1)
1629 self
.report_download_page(channel_id
, pagenum
)
1630 url
= self
._TEMPLATE
_URL
% (channel_id
, pagenum
)
1631 request
= urllib2
.Request(url
)
1633 page
= urllib2
.urlopen(request
).read()
1634 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1635 self
._downloader
.trouble(u
'ERROR: unable to download webpage: %s' % compat_str(err
))
1638 # Extract video identifiers
1640 for mobj
in re
.finditer(r
'href="/watch\?v=([0-9A-Za-z_-]+)&', page
):
1641 if mobj
.group(1) not in ids_in_page
:
1642 ids_in_page
.append(mobj
.group(1))
1643 video_ids
.extend(ids_in_page
)
1645 if re
.search(self
._MORE
_PAGES
_INDICATOR
, page
) is None:
1647 pagenum
= pagenum
+ 1
1649 for id in video_ids
:
1650 self
._downloader
.download(['http://www.youtube.com/watch?v=%s' % id])
1654 class YoutubeUserIE(InfoExtractor
):
1655 """Information Extractor for YouTube users."""
1657 _VALID_URL
= r
'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1658 _TEMPLATE_URL
= 'http://gdata.youtube.com/feeds/api/users/%s'
1659 _GDATA_PAGE_SIZE
= 50
1660 _GDATA_URL
= 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1661 _VIDEO_INDICATOR
= r
'/watch\?v=(.+?)[\<&]'
1662 IE_NAME
= u
'youtube:user'
1664 def __init__(self
, downloader
=None):
1665 InfoExtractor
.__init
__(self
, downloader
)
1667 def report_download_page(self
, username
, start_index
):
1668 """Report attempt to download user page."""
1669 self
._downloader
.to_screen(u
'[youtube] user %s: Downloading video ids from %d to %d' %
1670 (username
, start_index
, start_index
+ self
._GDATA
_PAGE
_SIZE
))
1672 def _real_extract(self
, url
):
1674 mobj
= re
.match(self
._VALID
_URL
, url
)
1676 self
._downloader
.trouble(u
'ERROR: invalid url: %s' % url
)
1679 username
= mobj
.group(1)
1681 # Download video ids using YouTube Data API. Result size per
1682 # query is limited (currently to 50 videos) so we need to query
1683 # page by page until there are no video ids - it means we got
1690 start_index
= pagenum
* self
._GDATA
_PAGE
_SIZE
+ 1
1691 self
.report_download_page(username
, start_index
)
1693 request
= urllib2
.Request(self
._GDATA
_URL
% (username
, self
._GDATA
_PAGE
_SIZE
, start_index
))
1696 page
= urllib2
.urlopen(request
).read()
1697 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1698 self
._downloader
.trouble(u
'ERROR: unable to download webpage: %s' % compat_str(err
))
1701 # Extract video identifiers
1704 for mobj
in re
.finditer(self
._VIDEO
_INDICATOR
, page
):
1705 if mobj
.group(1) not in ids_in_page
:
1706 ids_in_page
.append(mobj
.group(1))
1708 video_ids
.extend(ids_in_page
)
1710 # A little optimization - if current page is not
1711 # "full", ie. does not contain PAGE_SIZE video ids then
1712 # we can assume that this page is the last one - there
1713 # are no more ids on further pages - no need to query
1716 if len(ids_in_page
) < self
._GDATA
_PAGE
_SIZE
:
1721 all_ids_count
= len(video_ids
)
1722 playliststart
= self
._downloader
.params
.get('playliststart', 1) - 1
1723 playlistend
= self
._downloader
.params
.get('playlistend', -1)
1725 if playlistend
== -1:
1726 video_ids
= video_ids
[playliststart
:]
1728 video_ids
= video_ids
[playliststart
:playlistend
]
1730 self
._downloader
.to_screen(u
"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1731 (username
, all_ids_count
, len(video_ids
)))
1733 for video_id
in video_ids
:
1734 self
._downloader
.download(['http://www.youtube.com/watch?v=%s' % video_id
])
1737 class BlipTVUserIE(InfoExtractor
):
1738 """Information Extractor for blip.tv users."""
1740 _VALID_URL
= r
'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1742 IE_NAME
= u
'blip.tv:user'
1744 def __init__(self
, downloader
=None):
1745 InfoExtractor
.__init
__(self
, downloader
)
1747 def report_download_page(self
, username
, pagenum
):
1748 """Report attempt to download user page."""
1749 self
._downloader
.to_screen(u
'[%s] user %s: Downloading video ids from page %d' %
1750 (self
.IE_NAME
, username
, pagenum
))
1752 def _real_extract(self
, url
):
1754 mobj
= re
.match(self
._VALID
_URL
, url
)
1756 self
._downloader
.trouble(u
'ERROR: invalid url: %s' % url
)
1759 username
= mobj
.group(1)
1761 page_base
= 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1763 request
= urllib2
.Request(url
)
1766 page
= urllib2
.urlopen(request
).read().decode('utf-8')
1767 mobj
= re
.search(r
'data-users-id="([^"]+)"', page
)
1768 page_base
= page_base
% mobj
.group(1)
1769 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1770 self
._downloader
.trouble(u
'ERROR: unable to download webpage: %s' % compat_str(err
))
1774 # Download video ids using BlipTV Ajax calls. Result size per
1775 # query is limited (currently to 12 videos) so we need to query
1776 # page by page until there are no video ids - it means we got
1783 self
.report_download_page(username
, pagenum
)
1785 request
= urllib2
.Request( page_base
+ "&page=" + str(pagenum
) )
1788 page
= urllib2
.urlopen(request
).read().decode('utf-8')
1789 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1790 self
._downloader
.trouble(u
'ERROR: unable to download webpage: %s' % str(err
))
1793 # Extract video identifiers
1796 for mobj
in re
.finditer(r
'href="/([^"]+)"', page
):
1797 if mobj
.group(1) not in ids_in_page
:
1798 ids_in_page
.append(unescapeHTML(mobj
.group(1)))
1800 video_ids
.extend(ids_in_page
)
1802 # A little optimization - if current page is not
1803 # "full", ie. does not contain PAGE_SIZE video ids then
1804 # we can assume that this page is the last one - there
1805 # are no more ids on further pages - no need to query
1808 if len(ids_in_page
) < self
._PAGE
_SIZE
:
1813 all_ids_count
= len(video_ids
)
1814 playliststart
= self
._downloader
.params
.get('playliststart', 1) - 1
1815 playlistend
= self
._downloader
.params
.get('playlistend', -1)
1817 if playlistend
== -1:
1818 video_ids
= video_ids
[playliststart
:]
1820 video_ids
= video_ids
[playliststart
:playlistend
]
1822 self
._downloader
.to_screen(u
"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1823 (self
.IE_NAME
, username
, all_ids_count
, len(video_ids
)))
1825 for video_id
in video_ids
:
1826 self
._downloader
.download([u
'http://blip.tv/'+video_id
])
1829 class DepositFilesIE(InfoExtractor
):
1830 """Information extractor for depositfiles.com"""
1832 _VALID_URL
= r
'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1833 IE_NAME
= u
'DepositFiles'
1835 def __init__(self
, downloader
=None):
1836 InfoExtractor
.__init
__(self
, downloader
)
1838 def report_download_webpage(self
, file_id
):
1839 """Report webpage download."""
1840 self
._downloader
.to_screen(u
'[DepositFiles] %s: Downloading webpage' % file_id
)
1842 def report_extraction(self
, file_id
):
1843 """Report information extraction."""
1844 self
._downloader
.to_screen(u
'[DepositFiles] %s: Extracting information' % file_id
)
1846 def _real_extract(self
, url
):
1847 file_id
= url
.split('/')[-1]
1848 # Rebuild url in english locale
1849 url
= 'http://depositfiles.com/en/files/' + file_id
1851 # Retrieve file webpage with 'Free download' button pressed
1852 free_download_indication
= { 'gateway_result' : '1' }
1853 request
= urllib2
.Request(url
, urllib
.urlencode(free_download_indication
))
1855 self
.report_download_webpage(file_id
)
1856 webpage
= urllib2
.urlopen(request
).read()
1857 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1858 self
._downloader
.trouble(u
'ERROR: Unable to retrieve file webpage: %s' % compat_str(err
))
1861 # Search for the real file URL
1862 mobj
= re
.search(r
'<form action="(http://fileshare.+?)"', webpage
)
1863 if (mobj
is None) or (mobj
.group(1) is None):
1864 # Try to figure out reason of the error.
1865 mobj
= re
.search(r
'<strong>(Attention.*?)</strong>', webpage
, re
.DOTALL
)
1866 if (mobj
is not None) and (mobj
.group(1) is not None):
1867 restriction_message
= re
.sub('\s+', ' ', mobj
.group(1)).strip()
1868 self
._downloader
.trouble(u
'ERROR: %s' % restriction_message
)
1870 self
._downloader
.trouble(u
'ERROR: unable to extract download URL from: %s' % url
)
1873 file_url
= mobj
.group(1)
1874 file_extension
= os
.path
.splitext(file_url
)[1][1:]
1876 # Search for file title
1877 mobj
= re
.search(r
'<b title="(.*?)">', webpage
)
1879 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1881 file_title
= mobj
.group(1).decode('utf-8')
1884 'id': file_id
.decode('utf-8'),
1885 'url': file_url
.decode('utf-8'),
1887 'upload_date': u
'NA',
1888 'title': file_title
,
1889 'ext': file_extension
.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): lossy extraction — `try:`/`return`/`else:` lines and the
    # `_video_extensions` dict body appear elided (original line numbers jump).
    # Only visible tokens are preserved.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in the user's ~/.netrc for stored credentials.
    _NETRC_MACHINE = 'facebook'
    # Format identifiers, ordered best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    # [source gap: dict entries (original lines 1902-1905) elided]
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Map of info-dict keys to the regexes that locate them in the page.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        # [source gap: dict close and `video_info = {...}` init appear elided]
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped Unicode; unescape then URL-unquote.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        # [source gap: `video_urls = {}` init appears elided]
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls
        # [source gap: `return video_info` appears elided]

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data."""
        if self._downloader is None:
        # [source gap: the body of this guard (presumably `return`) is elided]
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [source gap: a `try:` appears elided here]
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # [source gap: lines 1969-1971 (credential unpack / else) elided]
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
        if useremail is None:
        # [source gap: lines 1978-1985 (early return and login_form build) elided]
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # [source gap: a `try:` appears elided here]
        login_results = urllib2.urlopen(request).read()
        # The login form being present in the response means the login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video info for a Facebook video URL."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        # [source gap: a `try:` appears elided here]
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image — missing thumbnail is only a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        # [source gap: an `else:` appears elided before this assignment]
        video_thumbnail = video_info['thumbnail']

        # upload date — parsed from an RFC-2822 style date when present.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # [source gap: a `try:` appears elided here]
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list when --format-limit was given.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # [source gap: an `else:` appears elided before this assignment]
            format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [source gap: an `else:` appears elided here]
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        # [source gap: `results = []` init appears elided]
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # [source gap: the `info = {` / append wrapper appears elided]
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines appear
    # elided (original line numbers jump). Only visible tokens preserved.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Regex used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract media info from a blip.tv page via its JSON API."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # [source gap: lines 2119-2124 (cchar selection from '?' in url) elided]
        # Ask the same URL for a JSON rendition of the post.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        # [source gap: `info = None` / `try:` appear elided]
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            # Drop the leading dot from the extension.
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # [source gap: lines 2137-2143 (direct-download info dict) elided]
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            # [source gap: a `try:` appears elided here]
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            # [source gap: a `try:` appears elided here]
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # [source gap: the `else: data = json_data` branch appears elided]

            # blip.tv's datestamp is e.g. '08-15-12 10:30AM'; normalize to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # [source gap: `if umobj is None:` guard appears elided]
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # [source gap: the `info = {` wrapper appears elided]
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves some media only to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        # [source gap: `return [info]` appears elided]
2188 class MyVideoIE(InfoExtractor
):
2189 """Information Extractor for myvideo.de."""
2191 _VALID_URL
= r
'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2192 IE_NAME
= u
'myvideo'
2194 def __init__(self
, downloader
=None):
2195 InfoExtractor
.__init
__(self
, downloader
)
2197 def report_download_webpage(self
, video_id
):
2198 """Report webpage download."""
2199 self
._downloader
.to_screen(u
'[myvideo] %s: Downloading webpage' % video_id
)
2201 def report_extraction(self
, video_id
):
2202 """Report information extraction."""
2203 self
._downloader
.to_screen(u
'[myvideo] %s: Extracting information' % video_id
)
2205 def _real_extract(self
,url
):
2206 mobj
= re
.match(self
._VALID
_URL
, url
)
2208 self
._download
.trouble(u
'ERROR: invalid URL: %s' % url
)
2211 video_id
= mobj
.group(1)
2214 request
= urllib2
.Request('http://www.myvideo.de/watch/%s' % video_id
)
2216 self
.report_download_webpage(video_id
)
2217 webpage
= urllib2
.urlopen(request
).read()
2218 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2219 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
2222 self
.report_extraction(video_id
)
2223 mobj
= re
.search(r
'<link rel=\'image_src
\' href
=\'(http
://is[0-9].myvideo\
.de
/de
/movie
[0-9]+/[a
-f0
-9]+)/thumbs
/[^
.]+\
.jpg
\' />',
2226 self._downloader.trouble(u'ERROR
: unable to extract media URL
')
2228 video_url = mobj.group(1) + ('/%s.flv
' % video_id)
2230 mobj = re.search('<title
>([^
<]+)</title
>', webpage)
2232 self._downloader.trouble(u'ERROR
: unable to extract title
')
2235 video_title = mobj.group(1)
2241 'upload_date
': u'NA
',
2242 'title
': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines and the
    # bodies of _video_extensions/_video_dimensions appear elided.
    # Only visible tokens preserved.

    # Accepts either a shortname alias (tds, colbert, ...) or a full-episodes URL.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # Bitrates known to be offered, ascending.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    # [source gap: dict entries (original lines 2256-2262) elided]
    _video_dimensions = {
    # [source gap: dict entries (original lines 2264-2271) elided]

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print the available format table for --list-formats."""
        print('Available formats:')
        # [source gap: the `for x in formats:` loop header appears elided]
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve a show URL (or shortname) to downloadable episode info."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortname aliases are rewritten to the canonical full-episodes URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # [source gap: an `else:` (original line 2300) appears elided]
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode means "download the newest".
        dlNewest = not mobj.group('episode')
        # [source gap: the `if dlNewest:`/`else:` pair around these two lines elided]
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        # [source gap: a `try:` appears elided here]
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # The server may redirect a "newest" request to a concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            # [source gap: `else:`/`return` lines (2340-2341) appear elided]
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # [source gap: a `try:` appears elided here]
        urlHandle = urllib2.urlopen(playerUrl_raw)
        # Follow redirects to get the final player URL.
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # [source gap: a `try:` appears elided here]
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # [source gap: `results = []` init appears elided]
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            # [source gap: a `try:` appears elided here]
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # [source gap: `turls = []` init appears elided]
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # [source gap: `turls.append(finfo)` appears elided]

            # [source gap: `if len(turls) == 0:` guard appears elided]
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # [source gap: a `return` appears elided]

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # [source gap: the `for f, v in turls: if f == req_format:` loop elided]
            format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            # [source gap: the `info = {` wrapper (lines 2418-2421) elided]
            'upload_date': officialDate,
            'description': officialTitle,
            'player_url': None #playerUrl

            results.append(info)
            # [source gap: `return results` appears elided]
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines appear
    # elided. Only visible tokens preserved.

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video info from an escapistmagazine.com view page."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # [source gap: a `try:` appears elided here]
        webPage = urllib2.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset advertised in the Content-Type header,
        # falling back to utf-8 when none is given.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Pull metadata out of the page's <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # [source gap: a `try:` appears elided here]
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')
        # [source gap: a `try:` appears elided here]
        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        # Entry 1 of the playlist holds the actual media URL.
        videoUrl = playlist[1]['url']

        # [source gap: the `info = {` wrapper and `return [info]` appear elided]
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines appear
    # elided. Only visible tokens preserved.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor video page to its moogaloop XML metadata."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        # [source gap: a `try:` appears elided here]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # The page id differs from the URL id; the XML feed wants the internal one.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        # [source gap: `if m is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        # [source gap: the `info = {` wrapper (lines 2543-2545, 2547) elided]
        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        # [source gap: a `try:` appears elided here]
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # [source gap: a `try:` appears elided here]
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Extension is whatever follows the last dot in the media URL.
        info['ext'] = info['url'].rpartition('.')[2]
        # [source gap: the `except IndexError:` header appears elided]
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
        # [source gap: `return [info]` appears elided]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines appear
    # elided. Only visible tokens preserved.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        # [source gap: a `try:` appears elided here]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        # Extract video URL — the page carries it URL-encoded in a flv_url param.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Extract title from the page <title>, trimming the " - XVID..." suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0).decode('utf-8')

        # [source gap: the `info = {` wrapper and `return [info]` appear elided]
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines appear
    # elided. Only visible tokens preserved.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract stream info for a soundcloud.com track page."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        # [source gap: a `try:` appears elided here]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        # [source gap: `if mobj:` guard appears elided]
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        # [source gap: `if mobj:` / `else:` appear elided around these lines]
        title = mobj.group(1).decode('utf-8')
        title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description — default when the page carries none.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        # [source gap: `if mobj:` guard appears elided]
        description = mobj.group(1)

        # upload date — parsed from the human-readable "pretty date" on the page.
        # [source gap: `upload_date = None` / `try:` appear elided]
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        # [source gap: `if mobj:` guard appears elided]
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:
            # Date parse failure is non-fatal; just log it.
            self._downloader.to_stderr(compat_str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        # [source gap: the `return [{` wrapper appears elided]
        'id': video_id.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': upload_date,
        'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines and the
    # IE_NAME assignment (original line 2740) appear elided.
    # Only visible tokens preserved.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe media URL and title from an infoq.com page."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        # [source gap: a `try:` appears elided here]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(url)

        # Extract video URL — the page stores a base64-encoded path in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # [source gap: the `info = {` wrapper and `return [info]` appear elided]
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # NOTE(review): lossy extraction — `try:`/`return`/guard lines appear
    # elided. Only visible tokens preserved.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # [source gap: a `try:` appears elided here]
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # [source gap: `return url_list` appears elided]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # [source gap: a `try:` appears elided here]
            urllib2.urlopen(url)
            # [source gap: `return url` appears elided]
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # [source gap: `continue` / trailing `return None` appear elided]

    def _print_formats(self, formats):
        """Print the available format/bitrate table for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # [source gap: a `try:` appears elided here]
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    # [source gap: a `break` appears elided]

    def _real_extract(self, url):
        """Resolve a mixcloud cloudcast page via the public JSON API."""
        mobj = re.match(self._VALID_URL, url)
        # [source gap: `if mobj is None:` guard appears elided]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        # [source gap: a `try:` appears elided here]
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        # [source gap: `bitrate = ...` (original lines 2884-2885) appears elided]

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # [source gap: a `return` appears elided]

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # [source gap: a `break` (found a working url) appears elided]
        # [source gap: an `else:` appears elided here]
        if req_format not in formats.keys():
            self._downloader.trouble(u'ERROR: format is not available')
            # [source gap: a `return` appears elided]

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        # [source gap: the `return [{` wrapper appears elided]
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes matched by _VALID_URL:
      * a specific video page (course + video query params),
      * a course page (course param only), and
      * the site root (no params).
    Course and root pages are treated as playlists: each linked page is
    re-dispatched through self.extract() and the per-video results are
    concatenated.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Both elements must exist; IndexError means the XML is
                # not in the expected format.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # Fall back to the course id when no <h1> title is found.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect unique links to the individual video pages.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse through the generic dispatcher for each video page.
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']

            # Collect unique links to the individual course pages.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes song name, performer and playlist id from <meta> tags on the
    video page, then downloads the mediaGen XML to pick a rendition URL.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme is optional in _VALID_URL; normalize for urlopen.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        try:
            metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # type attribute looks like "video/<ext>"; keep only the subtype.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits a video into numbered segments; the playlist JSON carries a
    per-video obfuscated file id plus a seed used to de-obfuscate it, and one
    key per segment.  _real_extract returns one info dict per segment.
    NOTE(review): Youku streams are typically only reachable from mainland
    China — failures outside may surface as 'unable to extract info section'.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a pseudo-random session id (millis + two random ints)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-shuffled alphabet used to decode obfuscated file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step drives the pseudo-random shuffle.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id via the mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = urllib2.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    Pulls the FLV URL, title and thumbnail straight out of the video page
    with the three class-level regexes.
    """

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Page-scraping patterns: flash video URL, page title, big thumbnail.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        # The flv_url value is percent-encoded in the page.
        video_url = urllib.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': 'flv',
                'thumbnail': video_thumbnail,
                'description': None,
                'player_url': None}

        return [info]
3305 class GooglePlusIE(InfoExtractor):
3306 """Information extractor
for plus
.google
.com
."""
3308 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3309 IE_NAME = u'plus.google'
3311 def __init__(self, downloader=None):
3312 InfoExtractor.__init__(self, downloader)
3314 def report_extract_entry(self, url):
3315 """Report downloading extry
"""
3316 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3318 def report_date(self, upload_date):
3319 """Report downloading extry
"""
3320 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3322 def report_uploader(self, uploader):
3323 """Report downloading extry
"""
3324 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3326 def report_title(self, video_title):
3327 """Report downloading extry
"""
3328 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3330 def report_extract_vid_page(self, video_page):
3331 """Report information extraction
."""
3332 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3334 def _real_extract(self, url):
3335 # Extract id from URL
3336 mobj = re.match(self._VALID_URL, url)
3338 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3341 post_url = mobj.group(0)
3342 video_id = mobj.group(2)
3344 video_extension = 'flv'
3346 # Step 1, Retrieve post webpage to extract further information
3347 self.report_extract_entry(post_url)
3348 request = urllib2.Request(post_url)
3350 webpage = urllib2.urlopen(request).read()
3351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3352 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3355 # Extract update date
3357 pattern = 'title="Timestamp">(.*?)</a>'
3358 mobj = re.search(pattern, webpage)
3360 upload_date = mobj.group(1)
3361 # Convert timestring to a format suitable for filename
3362 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3363 upload_date = upload_date.strftime('%Y%m%d')
3364 self.report_date(upload_date)
3368 pattern = r'rel\="author".*?>(.*?)</a>'
3369 mobj = re.search(pattern, webpage)
3371 uploader = mobj.group(1)
3372 self.report_uploader(uploader)
3375 # Get the first line for title
3377 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3378 mobj = re.search(pattern, webpage)
3380 video_title = mobj.group(1)
3381 self.report_title(video_title)
3383 # Step 2, Stimulate clicking the image box to launch video
3384 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3385 mobj = re.search(pattern, webpage)
3387 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3389 video_page = mobj.group(1)
3390 request = urllib2.Request(video_page)
3392 webpage = urllib2.urlopen(request).read()
3393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3394 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3396 self.report_extract_vid_page(video_page)
3399 # Extract video links on video page
3400 """Extract video links of all sizes
"""
3401 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3402 mobj = re.findall(pattern, webpage)
3404 self._downloader.trouble(u'ERROR: unable to extract video links')
3406 # Sort in resolution
3407 links = sorted(mobj)
3409 # Choose the lowest of the sort, i.e. highest resolution
3410 video_url = links[-1]
3411 # Only get the url. The resolution part in the tuple has no use anymore
3412 video_url = video_url[-1]
3413 # Treat escaped \u0026 style hex
3414 video_url = unicode(video_url, "unicode_escape")
3418 'id': video_id.decode('utf-8'),
3420 'uploader': uploader.decode('utf-8'),
3421 'upload_date': upload_date.decode('utf-8'),
3422 'title': video_title.decode('utf-8'),
3423 'ext': video_extension.decode('utf-8'),