]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
Fix xvideo IE in Python 3
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
9e8056d5
PH
4from __future__ import absolute_import
5
d77c3dfd 6import datetime
d77c3dfd
FV
7import netrc
8import os
9import re
10import socket
11import time
d77c3dfd 12import email.utils
921a1455 13import xml.etree.ElementTree
302efc19 14import random
15import math
d77c3dfd 16
9e8056d5 17from .utils import *
d77c3dfd
FV
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and produces information about
    the video(s) it refers to: the real media URL, title, uploader and so
    on.  The result is a list of dictionaries which is handed to the
    FileDownloader, which may then download the media to disk.

    Mandatory fields in each dictionary:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader, unescaped.
        upload_date:    Video upload date (YYYYMMDD).
        title:          Video title, unescaped.
        ext:            Video filename extension.

    Optional fields:

        format:         The video format, defaults to ext (used for --get-format)
        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.
        player_url:     SWF Player URL (used for rtmpdump).
        subtitles:      The .srt file contents.
        urlhandle:      [internal] The urlHandle to be used to download the file,
                        like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should override _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; they should normally also be registered in
    the list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.  Broken extractors should
    set _WORKING to False so users are warned and tests are skipped.
    """

    # Class-level defaults; initialize() flips _ready after the one-time
    # _real_initialize() call.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments, so re.VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML (as a unicode string) to SRT."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without an explicit duration show for 4s
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age.  All failures are reported through the
        downloader and abort initialization silently."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        # POST data must be bytes under Python 3 (fix: encode urlencode() output)
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # decode so the str regex below works under Python 3
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download and parse the watch page plus get_video_info, and return
        a list with one info dict per requested format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:  # date string doesn't match this format; try the next
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    # decode so the str regex below works under Python 3
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict views are not subscriptable in Python 3 (fix: wrap in list())
                    srt_lang = list(srt_lang_list.keys())[0]
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # exceptions are not indexable in Python 3 (fix: use .args)
                self._downloader.trouble(trouble.args[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age so that
        filtered videos are accessible."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # POST data must be bytes under Python 3 (fix: encode urlencode() output)
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a metacafe watch
        page; YouTube-hosted entries are delegated to the YouTube IE."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # decode so the str regexes below work under Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # match groups are already str (webpage was decoded); the old
        # .decode('utf-8') calls would raise AttributeError on Python 3
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL, title, uploader and upload
        date from a Dailymotion video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            # decode so the str regexes below work under Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Qualities listed best-first; take the first one present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # group is already str (webpage was decoded); the old .decode('utf-8')
        # would raise AttributeError on Python 3
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            # may be None when neither uploader pattern matched; the old
            # video_uploader.decode('utf-8') crashed in that case instead
            # of honoring the WARNING path above
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
732
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and description from a Google Video
        play page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            # decode so the str regexes below work under Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # no direct download link; fall back to the escaped flv URL
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # groups are already str (webpage was decoded); the old
        # .decode('utf-8') calls would raise AttributeError on Python 3
        video_title = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
825
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video info dict for a photobucket URL.

        Returns a one-element list with the info dictionary, or None
        (after reporting via self._downloader.trouble) on any failure.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: on Python 3 urlopen().read() returns bytes,
            # and the str regexes below would raise TypeError on bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Groups are already text on Python 3; the old .decode('utf-8')
        # calls would raise AttributeError on str.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
d77c3dfd
FV
889
890
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict for a Yahoo! Video URL.

        Non-/watch/ URLs are rewritten to the canonical /watch/ form and
        re-extracted recursively (new_video=False guards against loops).
        Returns a one-element list with the info dict, or None on failure.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode: urlopen().read() is bytes on Python 3 and the
                # str regexes below cannot search bytes.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 captures the literal 'people'/'profile' path component;
        # the uploader name is in group 2 (the old code wrongly used group 1).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
1031
1032
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict for a vimeo.com URL.

        Returns a one-element list with the info dict, or None on failure.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            # Decode once here: on Python 3 read() returns bytes, which
            # breaks the str .split() and regexes below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: markers not found; ValueError: malformed JSON.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (webpage is already text, no .decode needed)
        video_description = get_element_by_id("description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
1144
1145
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its contents as text, or None on error."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode: read() returns bytes on Python 3, and grep_webpage
            # applies str regexes to the result.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map match groups to a dict.

        matchTuples is a list of (group_index, key, error_message); every
        listed group must match or None is returned after reporting err.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the info dict for a live-stream page.

        The old implementation computed the RTMP URL and then silently
        discarded it; now the info dict is returned so the stream can
        actually be downloaded.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

        return {
            'id': url.split('/')[-1],
            'url': video_url,
            'uploader': u'arte.tv',
            'upload_date': None,
            # No title is available on the live page; use the page name.
            'title': url.split('/')[-1],
            'ext': u'flv',
            'format': u'NA',
            'player_url': info.get('player'),
        }

    def extractPlus7Stream(self, url):
        """Extract the info dict for an arte+7 (catch-up) video page."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Previously the live branch returned None unconditionally.
            info = self.extractLiveStream(url)
        else:
            info = self.extractPlus7Stream(url)

        if info is None:
            return
        return [info]
f2ad10a9
CA
1281
1282
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: look for a JW-Player-style file= URL."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to text: on Python 3 read() returns bytes, which the
            # str regexes below cannot search.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
d77c3dfd
FV
1427
1428
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # query is kept as a text string throughout; the old
        # query.decode(preferredencoding()) raised AttributeError on
        # Python 3 str.
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ytsearchN:/ytsearchall: query and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        # No encode('utf-8'): compat_urllib_parse.quote_plus needs text on
        # Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode before json.loads for compatibility with Python 3
                # versions that reject bytes input.
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
d77c3dfd
FV
1503
1504
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query stays text throughout; the old decode(preferredencoding())
        # raised AttributeError on Python 3 str.
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a gvsearchN:/gvsearchall: query and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        # No encode('utf-8'): quote_plus needs text on Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str regexes below work on Python 3 (read()
                # returns bytes there).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
d77c3dfd
FV
1585
1586
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query stays text throughout; the old decode(preferredencoding())
        # raised AttributeError on Python 3 str.
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a yvsearchN:/yvsearchall: query and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        # No encode('utf-8'): quote_plus needs text on Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str regexes below work on Python 3 (read()
                # returns bytes there).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
d77c3dfd
FV
1669
1670
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Accepts playlist/course/artist/user URLs as well as bare PL/EC ids
    (see _VALID_URL).  It does not return info dictionaries itself; it
    queues every contained video URL on the downloader.
    """

    # Group 1: list-type prefix ('p', 'a' or 'list'); group 2: the playlist id;
    # group 3: optional trailing single-video id (handled as a single download).
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled with (access page, prefix, playlist id, page number).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Pagination marker: literal "Next »" link text present while more pages exist.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                # read() yields bytes; decode so the str regex below works on Python 3.
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                # De-duplicate within this page only; cross-page duplicates are kept.
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # playliststart is 1-based for the user; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
d77c3dfd
FV
1749
1750
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id listed on the channel and queue each watch URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        # Walk the paginated video list until the "Next" marker disappears.
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids in first-seen order, de-duplicating within this page only.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
902b2a0a
FV
1801
1802
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Lists a user's uploads via the GData API and queues every watch URL
    on the downloader (honouring --playlist-start/--playlist-end).
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                # Decode to text: read() returns bytes on Python 3, which cannot
                # be matched against the str pattern in _VIDEO_INDICATOR.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based for the user; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1884
1885
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Fetches the user page to discover the numeric users_id, then pages
    through the AJAX episode list (_PAGE_SIZE items per page) and queues
    each video URL on the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Fixed: a page without data-users-id used to raise an uncaught
        # AttributeError (group() on None); report a proper error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user ID from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str instead of str, consistent with the other extractors.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based for the user; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
eeeb4daa
JCGS
1976
1977
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page.

        POSTs the 'Free download' form and scrapes the resulting page.
        Returns a one-element list with the info dictionary, or None on error.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # POST data must be bytes on Python 3, hence the encode().
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication).encode('utf-8'))
        try:
            self.report_download_webpage(file_id)
            # Decode to text so the str regexes below work on Python 3 as well.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage was decoded above, so these are already text strings;
        # the former .decode('utf-8') calls would fail on Python 3.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
d77c3dfd
FV
2040
2041
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Marked broken: the extractor is skipped/flagged as not working.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-to-worst; order matters for default format selection below.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with any of 'title'/'description'/'owner'/'thumbnail'
        that matched, plus 'video_urls' mapping format name -> URL.
        NOTE(review): the .decode("unicode_escape") calls assume Python 2
        byte strings; under Python 3 str has no decode - confirm before reuse.
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any.

        Silently does nothing when no downloader or no credentials are
        available; login failures are reported as warnings only.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> still present in the response means login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): video_url_list is only bound inside the if-block above;
        # an empty url_map would raise NameError here - confirm intended.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
d77c3dfd
FV
2247
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Requests the URL with skin=json appended; if the server answers with a
    video/* Content-Type the URL is treated as a direct download, otherwise
    the JSON metadata is parsed.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that a direct (non-JSON) download was detected."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query parameters, respecting an existing query string.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode on the splitext result assumes a Python 2
                # byte string; under Python 3 this is str - confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle so the probe response is downloaded.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the metadata in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves different content depending on the User-Agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
d77c3dfd
FV
2337
2338
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.trouble, which raised AttributeError
            # instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode to text so the str regexes below work on Python 3 as well.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        # The media base URL from the thumbnail link plus the id gives the flv.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
d77c3dfd
FV
2396
2397class ComedyCentralIE(InfoExtractor):
59ae15a5
PH
2398 """Information extractor for The Daily Show and Colbert Report """
2399
ca6849e6 2400 # urls can be abbreviations like :thedailyshow or :colbert
2401 # urls for episodes like:
2402 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2403 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2404 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2405 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2406 |(https?://)?(www\.)?
2407 (?P<showname>thedailyshow|colbertnation)\.com/
2408 (full-episodes/(?P<episode>.*)|
2409 (?P<clip>
2410 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2411 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2412 $"""
59ae15a5
PH
2413 IE_NAME = u'comedycentral'
2414
2415 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2416
2417 _video_extensions = {
2418 '3500': 'mp4',
2419 '2200': 'mp4',
2420 '1700': 'mp4',
2421 '1200': 'mp4',
2422 '750': 'mp4',
2423 '400': 'mp4',
2424 }
2425 _video_dimensions = {
2426 '3500': '1280x720',
2427 '2200': '960x540',
2428 '1700': '768x432',
2429 '1200': '640x360',
2430 '750': '512x288',
2431 '400': '384x216',
2432 }
2433
    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose multi-line pattern and
        # must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2437
59ae15a5
PH
    def report_extraction(self, episode_id):
        """Report the start of information extraction for an episode."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2440
    def report_config_download(self, episode_id):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2443
    def report_index_download(self, episode_id):
        """Report the download of the show's episode index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2446
    def report_player_url(self, episode_id):
        """Report that the player URL is being determined."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2449
2450
    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2455
2456
def _real_extract(self, url):
    """Extract one or more episodes from a Daily Show / Colbert Report URL.

    Handles three URL shapes: a bare show shortname (redirects to the
    newest full episode), a clip page, and a specific episode page.
    Returns a list of info dicts, one per <item> in the MRSS index feed.
    """
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    if mobj is None:
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        return

    # A bare shortname ('tds', 'thedailyshow', ...) is rewritten to the
    # show's full-episodes listing, then re-matched against the same regex.
    if mobj.group('shortname'):
        if mobj.group('shortname') in ('tds', 'thedailyshow'):
            url = u'http://www.thedailyshow.com/full-episodes/'
        else:
            url = u'http://www.colbertnation.com/full-episodes/'
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        assert mobj is not None

    if mobj.group('clip'):
        if mobj.group('showname') == 'thedailyshow':
            epTitle = mobj.group('tdstitle')
        else:
            epTitle = mobj.group('cntitle')
        dlNewest = False
    else:
        # No explicit episode in the URL means "download the newest one".
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

    req = compat_urllib_request.Request(url)
    self.report_extraction(epTitle)
    try:
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        return
    if dlNewest:
        # The full-episodes page redirects to the latest episode; recover
        # the episode title from the URL we actually ended up at.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
            return
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
            return
        epTitle = mobj.group('episode')

    # NOTE(review): html is the raw bytes returned by read(); matching str
    # patterns against it would fail on Python 3 until decoded — confirm.
    mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

    if len(mMovieParams) == 0:
        # The Colbert Report embeds the information in a without
        # a URL prefix; so extract the alternate reference
        # and then add the URL prefix manually.

        altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
        if len(altMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return
        else:
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

    playerUrl_raw = mMovieParams[0][0]
    self.report_player_url(epTitle)
    try:
        # Follow redirects to learn the final SWF player URL.
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
        return

    uri = mMovieParams[0][1]
    # The MRSS index lists every media item belonging to this episode.
    indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
    self.report_index_download(epTitle)
    try:
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
        return

    results = []

    idoc = xml.etree.ElementTree.fromstring(indexXml)
    itemEls = idoc.findall('.//item')
    for itemEl in itemEls:
        # The guid is colon-separated; last part is the media id, the part
        # before it is '<show>.com'.
        mediaId = itemEl.findall('./guid')[0].text
        shortMediaId = mediaId.split(':')[-1]
        showId = mediaId.split(':')[-2].replace('.com', '')
        officialTitle = itemEl.findall('./title')[0].text
        officialDate = itemEl.findall('./pubDate')[0].text

        # Per-item config XML lists the available renditions (bitrate, src).
        configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                compat_urllib_parse.urlencode({'uri': mediaId}))
        configReq = compat_urllib_request.Request(configUrl)
        self.report_config_download(epTitle)
        try:
            configXml = compat_urllib_request.urlopen(configReq).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        cdoc = xml.etree.ElementTree.fromstring(configXml)
        turls = []
        for rendition in cdoc.findall('.//rendition'):
            finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
            turls.append(finfo)

        if len(turls) == 0:
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
            continue

        if self._downloader.params.get('listformats', None):
            self._print_formats([i[0] for i in turls])
            return

        # For now, just pick the highest bitrate
        format,video_url = turls[-1]

        # Get the format arg from the arg stream
        req_format = self._downloader.params.get('format', None)

        # Select format if we can find one
        for f,v in turls:
            if f == req_format:
                format, video_url = f, v
                break

        # Patch to download from alternative CDN, which does not
        # break on current RTMPDump builds
        broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
        better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

        if video_url.startswith(broken_cdn):
            video_url = video_url.replace(broken_cdn, better_cdn)

        effTitle = showId + u'-' + epTitle
        info = {
            'id': shortMediaId,
            'url': video_url,
            'uploader': showId,
            'upload_date': officialDate,
            'title': effTitle,
            'ext': 'mp4',
            'format': format,
            'thumbnail': None,
            'description': officialTitle,
            'player_url': None #playerUrl
        }

        results.append(info)

    return results
d77c3dfd
FV
2607
2608
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Progress message: page fetch / metadata scrape has begun.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Progress message: player configuration download has begun.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Resolve an escapistmagazine.com video page to its FLV stream."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = match.group('showname')
        videoId = match.group('episode')

        self.report_extraction(showName)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_page = response.read()
            # Honour the charset declared in the Content-Type header,
            # falling back to UTF-8 when none is given.
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            encoding = charset_match.group(1) if charset_match else 'utf-8'
            page = raw_page.decode(encoding)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Scrape the meta / Open Graph tags for description, thumbnail
        # and the SWF player address.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the config file location in its query string.
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
d77c3dfd
FV
2680
2681
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is currently known to be broken and is skipped by tests.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor video page via its moogaloop metadata XML
        and the referenced Adobe f4m manifest; yields an 'f4f' fragment URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: the moogaloop metadata XML for this video id.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            # The <file> node points at the f4m manifest.
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore parameter is required by Adobe HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m namespace-qualified lookups; note video_id is re-bound to
            # the manifest's own <id> value from here on.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest location plus the
        # media node; '/z.../Seg1-Frag1' is the HDS fragment addressing
        # scheme.  NOTE(review): presumably later fragments must be fetched
        # separately — confirm downstream handling of 'f4f'.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
d77c3dfd
FV
2752
2753
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve an xvideos page to its FLV URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Always fetch through the canonical URL built from the id.
        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The stream URL is percent-encoded inside the flashvars.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: group(0) is the whole matched image URL.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
d77c3dfd
FV
2823
2824
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    Resolves a track page URL through SoundCloud's public resolve API,
    then queries the streams endpoint for the 128 kbit/s MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The uploader and the track's slug are both encoded in the path.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # First request: resolve the page URL into track metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Second request: the streams endpoint maps the track id to media URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
d77c3dfd
FV
2897
2898
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP URL, title and description from an InfoQ page.

        The page embeds the media path base64-encoded in a jsclassref
        attribute; it is decoded and appended to InfoQ's RTMP base URL.
        """
        import base64  # local import: only this extractor needs it

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to str so the regexes below work on Python 3 as well
            # as Python 2 (urlopen().read() returns bytes on Python 3).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the base64 payload decodes to the RTMP path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        # str.decode('base64') exists only on Python 2; the base64 module
        # works on both Python 2 and 3.
        media_path = base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(media_path)

        # Extract title (webpage is already str; no extra decode needed).
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description; fall back to a placeholder when absent.
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
d77c3dfd
FV
2967
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the URL list for *fmt* at *bitrate* ('best' picks the
        highest available); formats without bitrate info map directly
        to their URL list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list)  # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError:  # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate pair with its extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError:  # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url; re match groups are already
        # str on Python 3 (str has no .decode() there), so no decoding.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first for Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try each format until a live URL is found
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break  # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
d77c3dfd
FV
3080
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course page, or site root.

        Course and root pages act as playlists: every linked page is fed
        back through self.extract() and the results are concatenated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode to str so the regexes below work on Python 3
                # (urlopen().read() returns bytes there).
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Same bytes -> str decode as above for Python 3.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
d77c3dfd
FV
3197
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract performer, song title and the best-quality rendition URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode the page once up front so the str regexes work on
            # Python 3; iso-8859-1 matches the per-group decode the old
            # Python-2-only code performed.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message fixed: previously read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns the XML list of renditions for this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }]
6de7ef9b 3287
302efc19 3288
302efc19 3289class YoukuIE(InfoExtractor):
3290
59ae15a5
PH
3291 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3292 IE_NAME = u'Youku'
3293
def __init__(self, downloader=None):
    # Plain delegation to the base extractor; Youku keeps no extra state.
    InfoExtractor.__init__(self, downloader)
3296
def report_download_webpage(self, file_id):
    """Report webpage download."""
    message = u'[Youku] %s: Downloading webpage' % file_id
    self._downloader.to_screen(message)
3300
def report_extraction(self, file_id):
    """Report information extraction."""
    message = u'[Youku] %s: Extracting information' % file_id
    self._downloader.to_screen(message)
3304
3305 def _gen_sid(self):
3306 nowTime = int(time.time() * 1000)
3307 random1 = random.randint(1000,1998)
3308 random2 = random.randint(1000,9999)
3309
3310 return "%d%d%d" %(nowTime,random1,random2)
3311
3312 def _get_file_ID_mix_string(self, seed):
3313 mixed = []
3314 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3315 seed = float(seed)
3316 for i in range(len(source)):
3317 seed = (seed * 211 + 30031 ) % 65536
3318 index = math.floor(seed / 65536 * len(source) )
3319 mixed.append(source[int(index)])
3320 source.remove(source[int(index)])
3321 #return ''.join(mixed)
3322 return mixed
3323
3324 def _get_file_id(self, fileId, seed):
3325 mixed = self._get_file_ID_mix_string(seed)
3326 ids = fileId.split('*')
3327 realId = []
3328 for ch in ids:
3329 if ch:
3330 realId.append(mixed[int(ch)])
3331 return ''.join(realId)
3332
3333 def _real_extract(self, url):
3334 mobj = re.match(self._VALID_URL, url)
3335 if mobj is None:
3336 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3337 return
3338 video_id = mobj.group('ID')
3339
3340 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3341
3342 request = compat_urllib_request.Request(info_url, None, std_headers)
3343 try:
3344 self.report_download_webpage(video_id)
3345 jsondata = compat_urllib_request.urlopen(request).read()
3346 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3347 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3348 return
3349
3350 self.report_extraction(video_id)
3351 try:
3352 config = json.loads(jsondata)
3353
3354 video_title = config['data'][0]['title']
3355 seed = config['data'][0]['seed']
3356
3357 format = self._downloader.params.get('format', None)
3358 supported_format = config['data'][0]['streamfileids'].keys()
3359
3360 if format is None or format == 'best':
3361 if 'hd2' in supported_format:
3362 format = 'hd2'
3363 else:
3364 format = 'flv'
3365 ext = u'flv'
3366 elif format == 'worst':
3367 format = 'mp4'
3368 ext = u'mp4'
3369 else:
3370 format = 'flv'
3371 ext = u'flv'
3372
3373
3374 fileid = config['data'][0]['streamfileids'][format]
3375 seg_number = len(config['data'][0]['segs'][format])
3376
3377 keys=[]
3378 for i in xrange(seg_number):
3379 keys.append(config['data'][0]['segs'][format][i]['k'])
3380
3381 #TODO check error
3382 #youku only could be viewed from mainland china
3383 except:
3384 self._downloader.trouble(u'ERROR: unable to extract info section')
3385 return
3386
3387 files_info=[]
3388 sid = self._gen_sid()
3389 fileid = self._get_file_id(fileid, seed)
3390
3391 #column 8,9 of fileid represent the segment number
3392 #fileid[7:9] should be changed
3393 for index, key in enumerate(keys):
3394
3395 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3396 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3397
3398 info = {
3399 'id': '%s_part%02d' % (video_id, index),
3400 'url': download_url,
3401 'uploader': None,
3402 'upload_date': None,
3403 'title': video_title,
3404 'ext': ext,
3405 }
3406 files_info.append(info)
3407
3408 return files_info
5dc846fa
FV
3409
3410
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Match groups are already text; str has no .decode() on Python 3
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content; decode once so the str regexes below work on Python 3
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
fd873c69
FV
3472
3473
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        # url is already text; .decode('utf-8') would crash on Python 3
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            # Decode once so the str regexes below work on Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return  # was missing: falling through dereferenced None below

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return  # was missing: links[-1] below would raise IndexError

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex; the unicode() builtin does not exist
        # on Python 3, so round-trip through ascii bytes instead (works on 2 and 3)
        video_url = video_url.encode('ascii', 'ignore').decode('unicode_escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
4cc3d074
PH
3594
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived from the page path, not scraped from the page
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First capture group of rexp in the page (HTML-unescaped), or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was the typo 'uploader_date', which the downloader ignores
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]