]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Make ustream IE more robust
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import netrc
9 import os
10 import re
11 import socket
12 import time
13 import email.utils
14 import xml.etree.ElementTree
15 import random
16 import math
17
18 from .utils import *
19
20
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and produces a list of
    dictionaries describing the video(s) behind it (real media URL, title,
    uploader, ...).  Those dictionaries are handed to the FileDownloader,
    which may download the media, print information, etc.

    Mandatory keys in each result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and/or _real_extract(); _real_extract() must return
    a *list* of dictionaries as described above.  Broken extractors should
    set _WORKING to False so users are warned and tests are skipped.
    """

    # Class-level defaults; __init__/initialize() manage the instance state.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by stripping the trailing "IE".
        return type(self).__name__[:-2]

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Fetch a page and return its body decoded as UTF-8 (lossy).

        Raises ExtractorError on any network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            handle = compat_urllib_request.urlopen(url_or_request)
            raw = handle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
        return raw.decode('utf-8', 'replace')
121
122
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: group 1 is the (optional) URL prefix, group 2 the video id.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension ('flv' is assumed for anything not listed)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> video dimensions, written as "HEIGHTxWIDTH" (height first)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden to compile _VALID_URL with re.VERBOSE (it uses comments).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption length when no duration given
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name, as advertised by the listing page.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then whatever exists.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the given itags with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in
        and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video id from url, or None (after reporting) if the
        URL does not match _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the bare video id.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Full extraction: fetch the watch page and get_video_info, then
        build one result dict per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escapes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before trying the date formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except — strptime raises ValueError on a
                    # non-matching format; the next expression is simply tried.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict also carries a 'sig'
            # entry — a missing signature would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
540
541
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and POST the age confirmation
        so that filtered videos become reachable."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        Returns a one-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the
        # YouTube extractor through the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Bug fix: decode the response to text exactly once here. The
            # previous code kept the raw bytes and later called .decode()
            # on str regex groups, which breaks on Python 3 and mixed
            # bytes/str in the regex searches below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; it must be appended to the media URL.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob, which carries a JSON-ish
            # mediaData entry with the URL and access key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
667
668
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL plus metadata for one video."""
        # Validate the URL and isolate the bare video id (strip the
        # slug after '_' and any query string).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Walk the quality keys from best to worst and keep the first
        # one present in the flashvars blob.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: prefer the regular owner span, then the official
        # author markup; warn (but continue) if neither is present.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is not None:
            video_uploader = mobj.group(1)
        else:
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is not None:
                video_uploader = mobj_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date is rendered as DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
755
756
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a photobucket clip.

        Returns a one-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Bug fix: decode the response to text exactly once here. The
            # previous code kept the raw bytes and later called .decode()
            # on str regex groups, which breaks on Python 3 and mixed
            # bytes/str in the regex searches below.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
820
821
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        URLs that are valid but not of the '/watch/' form are first
        rewritten to their canonical '/watch/' form, and this method
        recurses exactly once on the rewritten URL (new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the 'people'/'profile' path segment of the
        # profile link; the uploader's display name is group(2). The old
        # code returned the literal string 'people' or 'profile'.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
963
964
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    # BUGFIX: the subdomain dot was unescaped ('(?:www|player).'), so it
    # matched any character between the subdomain and 'vimeo.com'.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Parses the player config JSON embedded in the page, then picks the
        best available codec/quality combination (hd > sd > other) and
        builds the play_redirect URL from the request signature/timestamp.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the split markers are absent from the page;
            # ValueError: the extracted span is not valid JSON.
            # (Previously a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1079
1080
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex, and map match groups to a dict.

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored in the result under key. Returns None
        (after reporting the error) if the regex does not match or a
        required group is absent.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): fetch_webpage returns None on download failure, in
        # which case re.search would raise TypeError here -- confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream player chain for url.

        NOTE(review): the final video_url is computed but never returned or
        stored, so live URLs currently yield no downloadable result (see
        _real_extract, which discards this call's outcome).
        """
        # Language code is a path segment of the arte.tv URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file names the geo-restricted stream path, the SWF player
        # and the rtmp URL in one blob; pull all three out together.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract an arte.tv +7 (catch-up) video; returns an info dict."""
        # Language code is a path segment of the arte.tv URL.
        video_lang = url.split('/')[-3]
        # Step 1: the player page points at a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the videoref file lists one <video> entry per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language XML carries id, title, date and the
        # hd-quality media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 (catch-up) extractor."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: extractLiveStream's result is discarded and
            # nothing is returned (effectively unsupported).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1215
1216
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip body-describing headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        # NOTE(review): HTTPErrorProcessor normally lives in urllib.request,
        # not urllib.error -- confirm compat_urllib_error re-exports it.
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Scrape a page for an embedded media URL (JW Player / SWFObject style)."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this branch previously reported 'unable to extract
            # title', copy-pasted from the title check above.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1361
1362
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        decoded_query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (decoded_query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        limit = n
        pagenum = 0

        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the real total; never request past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        for video_id in video_ids[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1437
1438
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        decoded_query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (decoded_query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers, skipping ones already seen.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in video_ids:
                    continue
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum += 1
1519
1520
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        decoded_query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (decoded_query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers, skipping ones already seen.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for vid in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum += 1
1605
1606
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a playlist, honouring playliststart/playlistend."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers without duplicating any.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(video_ids)

        # Apply the user-selected slice (playliststart is 1-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1685
1686
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a channel's upload list for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's identifiers without duplicating any.
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1737
1738
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch a user's uploads via the GData feed and queue each video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData uploads feed is paginated (currently 50 entries per
        # request), so we request consecutive pages until one comes back
        # short of a full page.
        video_ids = []
        page_idx = 0

        while True:
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            gdata_request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(gdata_request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Pick out the video ids, de-duplicated within this page.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)

            video_ids.extend(page_ids)

            # A page shorter than the page size means the feed is
            # exhausted; no further request is needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        all_ids_count = len(video_ids)
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)

        # Apply the user-requested playlist window.
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1821
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video page paths for a blip.tv user and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is needed for the AJAX episode-list endpoint.
        # Check the search result explicitly: previously a failed match
        # raised AttributeError on mobj.group(1), which the network-error
        # except clause above did not catch.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id for %s' % username)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str for consistency with the rest of the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the membership test, so HTML-escaped
                # duplicates are recognized as duplicates too (the old code
                # compared the raw match against the unescaped list).
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1912
1913
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode to text right away: urlopen().read() returns bytes on
            # Python 3, and the str regexes below cannot match bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so \s is a regex class, not an escape sequence.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # The page was already decoded above, so all extracted values are
        # text; the old .decode('utf-8') calls on them raised
        # AttributeError on Python 3 (str has no decode).
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1972
1973
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    NOTE(review): _WORKING is False, i.e. this extractor is currently
    disabled; the page-scraping regexes below presumably no longer match
    Facebook's markup — verify before re-enabling.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best quality first; also used for --format-limit handling.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page

        Returns a dict with any of 'title', 'description', 'owner',
        'thumbnail' that matched, plus 'video_urls' (format -> URL map,
        possibly empty).
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") assumes the match
                # is a bytes object (Python 2 semantics) — on Python 3 str
                # has no .decode; confirm before re-enabling this IE.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc.

        A missing login is not an error: the method simply returns and
        extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are only warnings; login is optional.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were
            # rejected (or we were rate-limited).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download a Facebook video page and build the info dict list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz returns a 10-tuple; strftime needs the
                    # first 9 fields. Any formatting failure is ignored and
                    # the date stays None.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if url_map:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never
        # assigned and the loop below raises NameError — confirm whether
        # an empty url_map can actually occur here.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2179
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv metadata via the skin=json endpoint.

        If the response Content-Type is video/*, the URL is a direct
        media link and the already-open handle is passed through in
        'urlhandle'; otherwise the JSON body is parsed for the info dict.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): Python 2 idiom — on Python 3, str has no
                # .decode and this branch would raise AttributeError.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the open response so the file is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh is guaranteed bound here: if urlopen had raised we
                # would already have returned above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves some content only to the iTunes user agent;
        # note this mutates the shared std_headers for later requests.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2269
2270
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this previously called self._download.trouble (no such
            # attribute), raising AttributeError instead of reporting the
            # invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the movie base URL; the flv lives at
        # <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2319
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, worst to best; keys into the two maps below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE, which
        # the default suitable() does not apply.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per video part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortnames like :tds redirect to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "download the newest episode";
            # the site redirects to the concrete episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to recover the episode component.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per video part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL to the equivalent plain-HTTP location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2513
2514
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the player config referenced by the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header,
            # falling back to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the four searches below are unguarded — if any
        # <meta> tag is missing, .group(1) raises AttributeError instead
        # of producing a trouble() report; consider adding None checks.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        # NOTE(review): the quote swap would corrupt values containing
        # apostrophes — presumably acceptable for this config format.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The second playlist entry holds the actual video URL.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2588
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest it
        references, and build the URL of the first video fragment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Result skeleton; title/description/thumbnail/url filled in below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The metadata document carries display fields plus the manifest URL.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore query parameter is expected by Adobe HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest (Adobe namespace) names the media node and an id.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # assumes the manifest id carries two trailing characters that must
        # be stripped before building the /z fragment path - TODO confirm
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2659
2660
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the page source."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv URL is embedded percent-encoded in the page source.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # For the thumbnail the whole matched URL is used (group 0),
        # not the capture group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2718
2719
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the page URL to a numeric track id through the public
    /resolve.json endpoint, then asks the streams endpoint for the
    128 kbps MP3 stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # No extractor-specific state; delegate straight to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a single-entry result list for the given track URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title  # NOTE: currently unused

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the human-readable URL to the track's numeric id.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for this track's per-format stream URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # NOTE(review): 'upload_date' is the raw created_at string here, not
        # the YYYYMMDD form the info-dict contract describes - confirm.
        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2792
2793
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Decode the base64 'jsclassref' blob into an rtmpe stream URL."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream id is base64-encoded in the jsclassref attribute.
        ref_match = re.search(r"jsclassref='([^']*)'", webpage)
        if ref_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        decoded = base64.b64decode(ref_match.group(1).encode('ascii')).decode('utf-8')
        real_id = compat_urllib_parse.unquote(decoded)
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Fall back to a placeholder when no meta description is present.
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1)
        else:
            video_description = u'No description available.'

        # The id and extension both come from the stream URL's last segment.
        video_filename = video_url.rpartition('/')[2]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2847
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format -> {bitrate: url_list}, or format -> url_list
        when the API provides no bitrate information (the TypeError path).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next one

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # FIX: re.match on a (unicode) str yields str groups; calling
        # .decode('utf-8') on them raises AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one of its URLs is reachable.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # FIX: check_urls may return None (every candidate dead); the old
        # code then crashed calling string methods on None.
        if file_url is None:
            self._downloader.trouble(u'ERROR: no working file URL found')
            return

        # FIX: values parsed from JSON/regex are already unicode str; the
        # previous .decode('utf-8') calls broke on Python 3.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2962
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course page, or the
        site root. The playlist branches recurse through self.extract()
        on each referenced page and concatenate the results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Derive the extension from the video file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # Course title; fall back to the course id if none found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the course's video-page links (deduplicated, in order)
            # and extract each one recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course-page link and extract each recursively.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3079
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Read song/performer meta tags, then fetch the mediaGen XML to
        pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # FIX: webpage is already a unicode str (from _download_webpage);
        # the old .decode('iso-8859-1') on a str raises AttributeError on
        # Python 3.
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # FIX: message previously read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        # FIX: an empty rendition list used to raise IndexError below.
        if not renditions:
            self._downloader.trouble(u'ERROR: unable to find renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        # FIX: a rendition without a <src> child made .find() return None
        # and .text raise AttributeError, which was not caught before.
        except (KeyError, AttributeError):
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3159
3160
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com; rebuilds the segmented flv/mp4
    download URLs from the site's obfuscated file id and seed."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the millisecond timestamp
        plus two random numbers (mirrors the site player's scheme)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the alphabet using the server-supplied
        seed; the result is the substitution table for _get_file_id."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step; picks and removes one char per round.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id by
        looking each index up in the seed-shuffled table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format to one of the site's stream types;
            # anything other than 'best'/'worst' falls back to flv.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (hex) into positions 8-9 of fileid.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3270
3271
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Report information extraction"""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail via the class regexes."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page body.
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv URL is stored percent-encoded in the page.
        url_result = re.search(self.VIDEO_URL_RE, webpage)
        if url_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_result.group(1))

        title_result = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_result.group(1)

        thumb_result = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_result.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3334
3335
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Fetch the post page, follow the embedded photos/video page, and
        pick the highest-resolution video link."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional; left as None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # FIX: return was missing here, so execution continued and
            # crashed on mobj.group(1) with AttributeError.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # FIX: return was missing here, so links[-1] below raised
            # IndexError on an empty result.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3459
class NBAIE(InfoExtractor):
    """Information extractor for videos hosted on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # Strip a trailing /index.html so the id maps onto the CDN path.
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media file lives on Turner's CDN at a path derived from the id.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped),
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was 'uploader_date', a typo for the documented
            # 'upload_date' field, so the value was silently ignored.
            # NOTE(review): the scraped value is the page's human-readable
            # date, not necessarily YYYYMMDD — confirm downstream handling.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3495
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        On any error a (0, []) tuple is returned so the caller's tuple
        unpacking never receives None (previously this returned None and
        crashed _real_extract with a TypeError).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a dict with an 'error' field.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip the dashes
                # to get the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # A channel URL (only group 1 matched) needs paged archive listing;
        # a /b/<id> URL addresses a single broadcast.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or empty) page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3580
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Bail out: without a match, m.group() below would raise
            # AttributeError on None.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None when absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3617
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # Bail out: m.group() on None would raise AttributeError.
            return
        status_id = m.group(1)

        # The description is best-effort (warning only); guard the match so
        # a missing description no longer crashes with AttributeError.
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded <a> tags before unescaping.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3666
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with (?x)-style comments, so it must be
        # matched with re.VERBOSE; hence the override of the default.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Movie entries and title spans appear in the same order on the page,
        # so pair them positionally.
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Skip this entry: previously a dict with an empty URL was
                # still appended to the results.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
3702
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The FLV file is addressable directly on the CDN by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Guard both scrapes: previously m.group(...) raised AttributeError
        # whenever the page markup changed and a pattern failed to match.
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = m.group('title') if m else None
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader') if m else None
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
3724
3725
def gen_extractors():
    """Build and return one instance of every supported extractor.

    Order matters: the first extractor whose pattern matches a URL is the
    one that handles it, so more specific extractors come first and
    GenericIE is the final fallback.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [ie_class() for ie_class in extractor_classes]
3769
3770