]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
Merge pull request #601 from paullik/no-post-overwrites
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
9e8056d5
PH
4from __future__ import absolute_import
5
d77c3dfd 6import datetime
d77c3dfd
FV
7import netrc
8import os
9import re
10import socket
11import time
d77c3dfd 12import email.utils
921a1455 13import xml.etree.ElementTree
302efc19 14import random
15import math
d77c3dfd 16
9e8056d5 17from .utils import *
d77c3dfd
FV
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) takes a URL and produces a list of
    dictionaries describing the video(s) behind it, which the
    FileDownloader then uses to actually fetch the media.

    Mandatory keys in each returned dictionary:

        id:          Video identifier.
        url:         Final video URL.
        uploader:    Full name of the video uploader, unescaped.
        upload_date: Video upload date (YYYYMMDD).
        title:       Video title, unescaped.
        ext:         Video filename extension.

    Optional keys:

        format:      The video format, defaults to ext (used for --get-format)
        thumbnail:   Full URL to a video thumbnail image.
        description: One-line video description.
        uploader_id: Nickname or id of the video uploader.
        player_url:  SWF Player URL (used for rtmpdump).
        subtitles:   The .srt file contents.
        urlhandle:   [internal] The urlHandle to be used to download the file,
                     like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses override _real_initialize() and _real_extract() and define
    a _VALID_URL regexp; they should also be registered in the list of
    extractors.  _real_extract() must return a *list* of dictionaries as
    described above.  Broken IEs should set _WORKING to False so users are
    warned and tests are skipped.
    """

    # Lazily flipped to True by initialize(); guards one-time setup.
    _ready = False
    # FileDownloader instance (or None when running detached).
    _downloader = None
    # Set to False in subclasses that are known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)
        self._ready = False

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Return whether this IE is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dicts for url."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
d77c3dfd
FV
103
104
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex (compiled with re.VERBOSE in suitable/_real_extract);
    # group 2 captures the video ID, group 1 the optional URL prefix.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first).
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, reordered so patent-free (webm) formats rank above equivalents.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string, used only for display in _print_formats /
    # the 'format' field ('???' when unknown).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to add re.VERBOSE for the multi-line regex.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # missing dur attribute: show caption for 4s
            start = float(start)
            end = start + float(dur)
            # SRT timestamp format: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning_message, srt_text) pair: exactly one of the two
        is None.  Language is chosen from --subtitleslang, then 'en', then
        the first language the video offers.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map of lang_code -> track name, parsed from the track list XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so later regexes match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download watch page + get_video_info, pick formats, return info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (the URL is JS-escaped in the page)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best-effort: only warns when missing)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize the page's human-readable date to YYYYMMDD
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # try the next date format; upload_date stays as-is if none match
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # The stream map is a comma-separated list of per-format
            # querystrings; each must carry itag, url and sig.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # The 'sig' value must be appended to the URL as '&signature='.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # format_param is None for RTMP downloads; fall back to the extension
            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
d77c3dfd
FV
519
520
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # group 1: video id, group 2: simplified title
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL (direct or from flashvars) and metadata."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ('yt-<id>' ids are delegated)
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # NOTE(review): read() returns bytes and is never decoded here;
            # the str regexes below and the .decode() calls on url-derived
            # strings look Python-2-only — verify against the target runtime.
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Older page layout: media URL is a plain query parameter.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer layout: parse the flashvars querystring for mediaData JSON.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # Un-escape the JSON-escaped slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
d77c3dfd
FV
646
647
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive; group 1 holds '<id>_<slug>' (id split off below).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Parse the flashvars on the watch page and return one info dict."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the title slug and any query string from the captured group.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still render.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present, in descending-quality order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Un-escape the JSON-escaped slashes in the media URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: try the regular owner span first, then the official-user
        # markup; missing uploader is only a warning.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date: page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
d77c3dfd
FV
744
745
d77c3dfd 746class PhotobucketIE(InfoExtractor):
59ae15a5
PH
747 """Information extractor for photobucket.com."""
748
749 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
750 IE_NAME = u'photobucket'
751
752 def __init__(self, downloader=None):
753 InfoExtractor.__init__(self, downloader)
754
755 def report_download_webpage(self, video_id):
756 """Report webpage download."""
757 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
758
759 def report_extraction(self, video_id):
760 """Report information extraction."""
761 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
762
763 def _real_extract(self, url):
764 # Extract id from URL
765 mobj = re.match(self._VALID_URL, url)
766 if mobj is None:
767 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
768 return
769
770 video_id = mobj.group(1)
771
772 video_extension = 'flv'
773
774 # Retrieve video webpage to extract further information
775 request = compat_urllib_request.Request(url)
776 try:
777 self.report_download_webpage(video_id)
778 webpage = compat_urllib_request.urlopen(request).read()
779 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
780 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
781 return
782
783 # Extract URL, uploader, and title from webpage
784 self.report_extraction(video_id)
785 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
786 if mobj is None:
787 self._downloader.trouble(u'ERROR: unable to extract media URL')
788 return
789 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
790
791 video_url = mediaURL
792
793 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
794 if mobj is None:
795 self._downloader.trouble(u'ERROR: unable to extract title')
796 return
797 video_title = mobj.group(1).decode('utf-8')
798
799 video_uploader = mobj.group(2).decode('utf-8')
800
801 return [{
802 'id': video_id.decode('utf-8'),
803 'url': video_url.decode('utf-8'),
804 'uploader': video_uploader,
805 'upload_date': None,
806 'title': video_title,
807 'ext': video_extension.decode('utf-8'),
808 }]
d77c3dfd
FV
809
810
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video URL.

        Non-'/watch/' URLs are first rewritten to the canonical
        English-language '/watch/' form and re-extracted recursively
        (new_video=False marks the recursive call).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information.
        # Decode the page once here; the old per-group .decode('utf-8')
        # calls (which fail on Python 3 str) are gone.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUG FIX: group(1) is the '(people|profile)' path alternative;
        # the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist query)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
952
953
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # FIX: the dot after (www|player) is now escaped; previously '.'
    # matched any character, so e.g. 'wwwXvimeo.com' was accepted.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo page.

        Reads the page's embedded config JSON, picks the best available
        codec/quality pair, and returns a one-element info-dict list.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.
        # FIX: the former bare 'except:' also swallowed KeyboardInterrupt
        # and SystemExit; only IndexError (marker missing from the page)
        # and ValueError (slice is not valid JSON) can occur here.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD), when present in the page
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
1068
1069
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches French/German video pages on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download *url* and return the raw response body; reports
        # errors through the downloader and returns None on failure.
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # compat_urllib_request raises ValueError for malformed URLs.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch *url*, apply *regex* with *regexFlags*, and build a dict
        # from matchTuples, a list of (group_index, key, error_message)
        # triples.  Returns None (after reporting) if the page or any
        # required group fails to match.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # NOTE(review): this method builds video_url but never returns
        # it, and _real_extract discards this call's result — the
        # live-stream path currently produces no downloadable info
        # dict.  Confirm whether live support was left unfinished.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Follow the chain of intermediate documents used by the
        # "Arte+7" catch-up service down to the final video metadata.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            # Python 2 bytes -> unicode; assumes the feed is UTF-8.
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams are recognised by their 'index-<n>.html' suffix.
        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so this
            # branch yields no downloads (see note on that method).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
f2ad10a9
CA
1205
1206
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything; this IE must be tried after all the others.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # A Request that issues HEAD instead of GET so we can learn
            # the final URL without downloading any body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back: no redirect, continue with normal extraction.
        if url == new_url:
            return False

        # A redirect happened: restart the whole download chain with
        # the resolved URL so the right IE gets a chance at it.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # NOTE(review): message says "title" but this branch is the
            # uploader (domain) extraction — looks like a copy-paste.
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
d77c3dfd
FV
1351
1352
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:terms' query and queue the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                if n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
        return

    def _download_n_results(self, query, n):
        """Fetch up to *n* search results for *query* and queue each one."""

        collected = []
        pagenum = 0
        limit = n

        # The API caps totalItems, so 'limit' shrinks once we know the
        # real result count; 50 results arrive per page.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            collected.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Queue at most n videos for download.
        for video_id in collected[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
d77c3dfd
FV
1427
1428
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Captures the docid of each result link.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present only while further result pages exist.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse a 'gvsearch[N|all]:terms' query: no count means 1 result,
        # 'all' means the maximum, otherwise N clamped to the maximum.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the literal 'gvsearch' to leave just the count suffix.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    # Queue and stop as soon as n distinct ids are found,
                    # even mid-page.
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue everything collected so far.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
FV
1509
1510
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Marked broken: the Yahoo! Video site has changed since this was written.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Captures the '<page id>/<video id>' pair from result links.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse a 'yvsearch[N|all]:terms' query: no count means 1 result,
        # 'all' means the maximum, otherwise N clamped to the maximum.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the literal 'yvsearch' to leave just the count suffix.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        # Yahoo! result pages can repeat entries; remember what we've seen.
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    # Queue and stop as soon as n distinct ids are found,
                    # even mid-page.
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: queue everything collected so far.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
d77c3dfd
FV
1595
1596
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # group(1): playlist type prefix ('p', 'a', 'list'),
    # group(2): playlist id,
    # group(3): a single video id when the URL points into a playlist.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Literal text of the "Next »" paging link on playlist pages.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            # Artist playlists use a different endpoint and parameter name.
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated per page)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the "Next" paging link disappears.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # --playlist-start is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            # playlistend is 1-based inclusive, which coincides with the
            # exclusive upper bound of a 0-based slice.
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
d77c3dfd
FV
1675
1676
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Walk a channel's paginated video listing and enqueue every video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1
        more_pages = True

        # Keep requesting listing pages while the "Next »" marker is present.
        while more_pages:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids from this page, keeping first occurrences only.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            more_pages = self._MORE_PAGES_INDICATOR in page
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Queue each id as a plain watch URL.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
902b2a0a
FV
1727
1728
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        upper_bound = start_index + self._GDATA_PAGE_SIZE
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, upper_bound))

    def _real_extract(self, url):
        """Collect all uploads of a user via the GData API and enqueue them."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData feed is paginated (currently 50 entries per request),
        # so keep fetching until a page comes back short.
        video_ids = []
        page_index = 0

        while True:
            first_entry = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first_entry)

            feed_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first_entry)
            request = compat_urllib_request.Request(feed_url)

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids from this page, de-duplicated in order of appearance.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A short page means there is nothing left to fetch.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_index += 1

        all_ids_count = len(video_ids)

        # Apply the user's playliststart/playlistend window
        # (1-based start; an end of -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1810
1811
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Enqueue every video posted by a blip.tv user.

        Downloads the user's page to discover their numeric id, walks the
        paginated mobile AJAX episode listing until a short page signals
        the end, and hands each discovered video URL to the downloader.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: this lookup used to run unchecked inside the try block
        # above; a page without data-users-id raised an uncaught
        # AttributeError (None.group) instead of a clean error report.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # CONSISTENCY: use compat_str like every other handler in
                # this file (this one used plain str()).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated per page.
            # BUG FIX: the raw href used to be tested against the list of
            # already-unescaped ids, so any id changed by unescapeHTML was
            # never recognised as a duplicate.
            ids_in_page = []
            for id_mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(id_mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's playliststart/playlistend window
        # (1-based start; an end of -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
eeeb4daa
JCGS
1902
1903
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles link to the direct file URL and title.

        Requests the English-locale page with the 'Free download' form
        value set, then scrapes the fileshare URL; on failure it tries to
        surface the site's restriction message.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUG FIX: decode the response once so the regex searches below
            # operate on text; matching str patterns against the raw bytes
            # raises TypeError on Python 3 (other IEs in this file already
            # decode their responses).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # BUG FIX: these values are already text after the decode above;
        # the old str.decode('utf-8') calls fail on Python 3.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
d77c3dfd
FV
1966
1967
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Marked as not working; presumably the downloader skips or warns on
    # broken IEs based on this flag — confirm against FileDownloader.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers in descending quality order (index 0 = best).
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: info-dict key -> regex with one capture group.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") on the match assumes
                # a Python 2 byte string — this method would fail on Python 3
                # str input; confirm before re-enabling (_WORKING is False).
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one entry per quality that actually appears on the page.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data.

        Does nothing when no downloader is attached or no credentials are
        available; login failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials from either source: skip login entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download a Facebook video page and return its info dict(s).

        Honors the downloader's 'format' and 'format_limit' params when
        selecting which of the scraped quality variants to return.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        # NOTE(review): .decode on the title assumes a Python 2 byte string
        # (see _parse_page) — would raise on Python 3.
        video_title = video_title.decode('utf-8')

        # thumbnail image: missing thumbnail is only a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date: parse an RFC-2822 style date into YYYYMMDD.
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                # NOTE(review): bare except silently drops any strftime
                # failure, leaving upload_date as None.
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(list(url_map.keys())) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # format_limit caps quality: keep only formats at or below it.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

            # Build one info dict per selected format.
            results = []
            for format_param, video_real_url in video_url_list:
                # Extension
                video_extension = self._video_extensions.get(format_param, 'mp4')

                # NOTE(review): the .decode('utf-8') calls below also assume
                # Python 2 byte strings — verify before re-enabling this IE.
                results.append({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                })
            return results
d77c3dfd
FV
2173
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract a blip.tv video via the site's JSON API.

        Appends skin=json to the URL; if the server instead answers with a
        video/* Content-Type, the URL is treated as a direct file download
        and returned as-is together with the open handle.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode assumes a Python 2 byte string here;
                # on Python 3 the split result is already str and this raises.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # 'urlhandle' lets the downloader reuse this open response
                # instead of re-requesting the file.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh is still open from the request above; its body is the
                # JSON metadata payload.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses may wrap the record in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert e.g. '12-31-12 11:05PM' to '20121231'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): global side effect — every subsequent request in the
        # process presents an iTunes User-Agent; presumably required by the
        # blip.tv media servers. Confirm before relying on std_headers later.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
d77c3dfd
FV
2263
2264
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV URL and title of a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this used to call self._download.trouble — no such
            # attribute exists; every other extractor uses self._downloader,
            # so an invalid URL raised AttributeError instead of reporting.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the video itself
        # lives next to it as <video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
d77c3dfd
FV
2322
2323class ComedyCentralIE(InfoExtractor):
59ae15a5
PH
2324 """Information extractor for The Daily Show and Colbert Report """
2325
ca6849e6 2326 # urls can be abbreviations like :thedailyshow or :colbert
cdb30764 2327 # urls for episodes like:
ca6849e6 2328 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2329 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
cdb30764 2330 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
ca6849e6 2331 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2332 |(https?://)?(www\.)?
2333 (?P<showname>thedailyshow|colbertnation)\.com/
2334 (full-episodes/(?P<episode>.*)|
2335 (?P<clip>
2336 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2337 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
cdb30764 2338 $"""
59ae15a5
PH
2339 IE_NAME = u'comedycentral'
2340
2341 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2342
2343 _video_extensions = {
2344 '3500': 'mp4',
2345 '2200': 'mp4',
2346 '1700': 'mp4',
2347 '1200': 'mp4',
2348 '750': 'mp4',
2349 '400': 'mp4',
2350 }
2351 _video_dimensions = {
2352 '3500': '1280x720',
2353 '2200': '960x540',
2354 '1700': '768x432',
2355 '1200': '640x360',
2356 '750': '512x288',
2357 '400': '384x216',
2358 }
2359
ca6849e6 2360 def suitable(self, url):
2361 """Receives a URL and returns True if suitable for this IE."""
2362 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2363
59ae15a5
PH
2364 def report_extraction(self, episode_id):
2365 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2366
2367 def report_config_download(self, episode_id):
2368 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2369
2370 def report_index_download(self, episode_id):
2371 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2372
2373 def report_player_url(self, episode_id):
2374 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2375
2376
2377 def _print_formats(self, formats):
2378 print('Available formats:')
2379 for x in formats:
2380 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2381
2382
2383 def _real_extract(self, url):
ca6849e6 2384 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2385 if mobj is None:
2386 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2387 return
2388
2389 if mobj.group('shortname'):
2390 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2391 url = u'http://www.thedailyshow.com/full-episodes/'
2392 else:
2393 url = u'http://www.colbertnation.com/full-episodes/'
ca6849e6 2394 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2395 assert mobj is not None
2396
ca6849e6 2397 if mobj.group('clip'):
2398 if mobj.group('showname') == 'thedailyshow':
2399 epTitle = mobj.group('tdstitle')
2400 else:
2401 epTitle = mobj.group('cntitle')
2402 dlNewest = False
59ae15a5 2403 else:
ca6849e6 2404 dlNewest = not mobj.group('episode')
2405 if dlNewest:
2406 epTitle = mobj.group('showname')
2407 else:
2408 epTitle = mobj.group('episode')
59ae15a5
PH
2409
2410 req = compat_urllib_request.Request(url)
2411 self.report_extraction(epTitle)
2412 try:
2413 htmlHandle = compat_urllib_request.urlopen(req)
2414 html = htmlHandle.read()
2415 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2416 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2417 return
2418 if dlNewest:
2419 url = htmlHandle.geturl()
ca6849e6 2420 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2421 if mobj is None:
2422 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2423 return
2424 if mobj.group('episode') == '':
2425 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2426 return
2427 epTitle = mobj.group('episode')
2428
ca6849e6 2429 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
59ae15a5
PH
2430
2431 if len(mMovieParams) == 0:
2432 # The Colbert Report embeds the information in a without
2433 # a URL prefix; so extract the alternate reference
2434 # and then add the URL prefix manually.
2435
ca6849e6 2436 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
59ae15a5
PH
2437 if len(altMovieParams) == 0:
2438 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2439 return
2440 else:
2441 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
cdb30764 2442
59ae15a5
PH
2443 playerUrl_raw = mMovieParams[0][0]
2444 self.report_player_url(epTitle)
2445 try:
2446 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2447 playerUrl = urlHandle.geturl()
2448 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2449 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2450 return
2451
2452 uri = mMovieParams[0][1]
2453 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2454 self.report_index_download(epTitle)
2455 try:
2456 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2457 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2458 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2459 return
2460
2461 results = []
2462
2463 idoc = xml.etree.ElementTree.fromstring(indexXml)
2464 itemEls = idoc.findall('.//item')
2465 for itemEl in itemEls:
2466 mediaId = itemEl.findall('./guid')[0].text
2467 shortMediaId = mediaId.split(':')[-1]
2468 showId = mediaId.split(':')[-2].replace('.com', '')
2469 officialTitle = itemEl.findall('./title')[0].text
2470 officialDate = itemEl.findall('./pubDate')[0].text
2471
2472 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2473 compat_urllib_parse.urlencode({'uri': mediaId}))
2474 configReq = compat_urllib_request.Request(configUrl)
2475 self.report_config_download(epTitle)
2476 try:
2477 configXml = compat_urllib_request.urlopen(configReq).read()
2478 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2479 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2480 return
2481
2482 cdoc = xml.etree.ElementTree.fromstring(configXml)
2483 turls = []
2484 for rendition in cdoc.findall('.//rendition'):
2485 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2486 turls.append(finfo)
2487
2488 if len(turls) == 0:
2489 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2490 continue
cdb30764 2491
59ae15a5
PH
2492 if self._downloader.params.get('listformats', None):
2493 self._print_formats([i[0] for i in turls])
2494 return
2495
2496 # For now, just pick the highest bitrate
2497 format,video_url = turls[-1]
2498
2499 # Get the format arg from the arg stream
2500 req_format = self._downloader.params.get('format', None)
2501
2502 # Select format if we can find one
2503 for f,v in turls:
2504 if f == req_format:
2505 format, video_url = f, v
2506 break
2507
2508 # Patch to download from alternative CDN, which does not
2509 # break on current RTMPDump builds
2510 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2511 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2512
2513 if video_url.startswith(broken_cdn):
2514 video_url = video_url.replace(broken_cdn, better_cdn)
2515
2516 effTitle = showId + u'-' + epTitle
2517 info = {
2518 'id': shortMediaId,
2519 'url': video_url,
2520 'uploader': showId,
2521 'upload_date': officialDate,
2522 'title': effTitle,
2523 'ext': 'mp4',
2524 'format': format,
2525 'thumbnail': None,
2526 'description': officialTitle,
2527 'player_url': None #playerUrl
2528 }
2529
2530 results.append(info)
cdb30764 2531
59ae15a5 2532 return results
d77c3dfd
FV
2533
2534
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce the start of information extraction for *showName*."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Return the info dict list for an Escapist video page URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honour the charset advertised in the Content-Type header,
            # falling back to UTF-8 when none is given.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            webPage = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Scrape metadata from the page's <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', webPage).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', webPage).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', webPage).group(1))
        # The player URL carries the configuration URL in its query string.
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            config_handle = compat_urllib_request.urlopen(configUrl)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            configJSON = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: normalise the quotes first.
        configJSON = configJSON.replace("'", '"')
        try:
            config = json.loads(configJSON)
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
d77c3dfd
FV
2608
2609
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Pipeline: video id -> moogaloop metadata XML -> HDS (f4m) manifest
        # -> synthesized first-fragment URL, hence the 'f4f' extension.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Pull title/description/thumbnail and the manifest location out of
        # the metadata document; any missing element aborts extraction.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore query parameter is appended for the HDS manifest request
        # (NOTE(review): the exact version string presumably matters to the
        # CDN -- confirm before changing it)
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # f4m manifest elements live in the Adobe f4m XML namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL; video_id[:-2] strips the trailing two
        # characters of the manifest id -- TODO confirm against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
d77c3dfd
FV
2680
2681
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            page_bytes = compat_urllib_request.urlopen(request).read()
            webpage = page_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The stream URL is percent-encoded inside a flash variable.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the entire matched URL is used (group 0), not the
        # captured file name.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
d77c3dfd
FV
2751
2752
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    The track page URL is resolved through the public API to a track id,
    and the streams endpoint then yields the direct mp3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The uploader and the slug of the song title are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
d77c3dfd
FV
2825
2826
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe URL and metadata from an InfoQ talk page.

        Python 3 fixes over the original implementation: the page bytes
        are decoded before applying str regexes, the removed 'base64' text
        codec (str.decode('base64')) is replaced with base64.b64decode,
        and the spurious .decode('utf-8') calls on str match groups are
        dropped.
        """
        import base64  # local: only this extractor needs it

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text so the str patterns below can match.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded rtmpe path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1)).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
d77c3dfd
FV
2895
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If *bitrate* is None, 'best' or unknown, the highest available
        bitrate is chosen; formats without bitrate info map directly to
        a url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the first working stream URL for a cloudcast.

        Fixes Python 3 crashes of the original: re match groups and the
        url/uploader values are already str, so the .decode('utf-8')
        calls raised AttributeError; the JSON response bytes are now
        decoded explicitly before json.loads.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe each format until a live URL is found
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in list(formats.keys()):
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
d77c3dfd
FV
3010
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course page, or site root.

        Course and root pages are crawled recursively via self.extract().
        Python 3 fix: the course/root page bytes are decoded before the
        str regexes below are applied (re would raise TypeError on bytes).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # decode so the str regexes below work on Python 3
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # decode so the str regex below works on Python 3
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
d77c3dfd
FV
3127
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the highest-quality rendition for an MTV video page.

        Python 3 fixes over the original: the webpage bytes are decoded
        once (as iso-8859-1, the charset the original assumed when it
        .decode('iso-8859-1')d the match groups) so the str regexes can
        match; the per-group decode calls are dropped.  Also fixes the
        garbled 'unable to mtvn_uri' error message.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # latin-1 decodes any byte sequence, so this cannot raise
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
6de7ef9b 3217
302efc19 3218
302efc19 3219class YoukuIE(InfoExtractor):
3220
59ae15a5
PH
3221 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3222 IE_NAME = u'Youku'
3223
3224 def __init__(self, downloader=None):
3225 InfoExtractor.__init__(self, downloader)
3226
3227 def report_download_webpage(self, file_id):
3228 """Report webpage download."""
3229 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3230
3231 def report_extraction(self, file_id):
3232 """Report information extraction."""
3233 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3234
3235 def _gen_sid(self):
3236 nowTime = int(time.time() * 1000)
3237 random1 = random.randint(1000,1998)
3238 random2 = random.randint(1000,9999)
3239
3240 return "%d%d%d" %(nowTime,random1,random2)
3241
3242 def _get_file_ID_mix_string(self, seed):
3243 mixed = []
3244 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3245 seed = float(seed)
3246 for i in range(len(source)):
3247 seed = (seed * 211 + 30031 ) % 65536
3248 index = math.floor(seed / 65536 * len(source) )
3249 mixed.append(source[int(index)])
3250 source.remove(source[int(index)])
3251 #return ''.join(mixed)
3252 return mixed
3253
3254 def _get_file_id(self, fileId, seed):
3255 mixed = self._get_file_ID_mix_string(seed)
3256 ids = fileId.split('*')
3257 realId = []
3258 for ch in ids:
3259 if ch:
3260 realId.append(mixed[int(ch)])
3261 return ''.join(realId)
3262
3263 def _real_extract(self, url):
3264 mobj = re.match(self._VALID_URL, url)
3265 if mobj is None:
3266 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3267 return
3268 video_id = mobj.group('ID')
3269
3270 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3271
3272 request = compat_urllib_request.Request(info_url, None, std_headers)
3273 try:
3274 self.report_download_webpage(video_id)
3275 jsondata = compat_urllib_request.urlopen(request).read()
3276 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3277 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3278 return
3279
3280 self.report_extraction(video_id)
3281 try:
8f6f40d9
PH
3282 jsonstr = jsondata.decode('utf-8')
3283 config = json.loads(jsonstr)
59ae15a5
PH
3284
3285 video_title = config['data'][0]['title']
3286 seed = config['data'][0]['seed']
3287
3288 format = self._downloader.params.get('format', None)
1a2c3c0f 3289 supported_format = list(config['data'][0]['streamfileids'].keys())
59ae15a5
PH
3290
3291 if format is None or format == 'best':
3292 if 'hd2' in supported_format:
3293 format = 'hd2'
3294 else:
3295 format = 'flv'
3296 ext = u'flv'
3297 elif format == 'worst':
3298 format = 'mp4'
3299 ext = u'mp4'
3300 else:
3301 format = 'flv'
3302 ext = u'flv'
3303
3304
3305 fileid = config['data'][0]['streamfileids'][format]
e2a8ff24 3306 keys = [s['k'] for s in config['data'][0]['segs'][format]]
8f6f40d9 3307 except (UnicodeDecodeError, ValueError, KeyError):
59ae15a5
PH
3308 self._downloader.trouble(u'ERROR: unable to extract info section')
3309 return
3310
3311 files_info=[]
3312 sid = self._gen_sid()
3313 fileid = self._get_file_id(fileid, seed)
3314
3315 #column 8,9 of fileid represent the segment number
3316 #fileid[7:9] should be changed
3317 for index, key in enumerate(keys):
3318
3319 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3320 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3321
3322 info = {
3323 'id': '%s_part%02d' % (video_id, index),
3324 'url': download_url,
3325 'uploader': None,
3326 'upload_date': None,
3327 'title': video_title,
3328 'ext': ext,
3329 }
3330 files_info.append(info)
3331
3332 return files_info
5dc846fa
FV
3333
3334
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report that the webpage download has started."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the page and scrape the flv URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # All needed fields are embedded directly in the HTML page.
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if not url_match:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if not title_match:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if not thumb_match:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
fd873c69
FV
3397
3398
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video from a Google+ post.

        Step 1 scrapes the post page for date/uploader/title and the photo
        viewer URL; step 2 loads that viewer page and picks the
        highest-resolution googlevideo link.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional field; None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
            self.report_date(upload_date)

        # Extract uploader (optional field; None when not found)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
            self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
            self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: bail out instead of dereferencing a None match below
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: bail out instead of indexing into an empty list below
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
4cc3d074
PH
3522
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the page for og:title/date/description; the media URL is
        derived directly from the video path on Turner's CDN."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First regex capture group from the page, HTML-unescaped,
            # or `default` when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date', so the upload date
            # never reached the downloader ('upload_date' is the documented field).
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
0b40544f
DV
3568
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one JSON API page and return (raw_item_count, info_dicts).

        Only clips with a non-empty 'video_file_url' become info dicts, but
        the count reflects every item the API returned (used for paging).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: previously returned None here, which crashed the
            # caller's tuple unpacking; return an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a single clip, or page through a channel's full archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # lastindex == 1 means only the channel group matched -> archive paging
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
21a9c6aa
PH
3647
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    IE_NAME = u'FunnyOrDie'

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the <video>/<source> tag and og: metadata from the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # BUGFIX: bail out instead of calling .group() on None below
            return
        video_url = unescapeHTML(m.group('url'))
        # BUGFIX: removed leftover debug print(video_url)

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # BUGFIX: bail out instead of calling .group() on None below
            return
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]