]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Better name for InfoQ IE
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import netrc
9 import os
10 import re
11 import socket
12 import time
13 import email.utils
14 import xml.etree.ElementTree
15 import random
16 import math
17
18 from .utils import *
19
20
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) takes a URL and produces the metadata
    needed to download the video(s) it refers to: the real media URL, the
    title, the uploader and so on.  The metadata is handed to the
    FileDownloader as a list of dictionaries; the downloader then decides
    what to do with it (typically: download the file).

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses override _real_initialize() and _real_extract() and define a
    _VALID_URL regular expression; they should usually also be added to the
    list of extractors.  _real_extract() must return a *list* of information
    dictionaries shaped as above.  Broken IEs should set _WORKING to False
    so users are warned and the tests skip them.
    """

    # Class-level defaults; instances overwrite _ready and _downloader.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this IE is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.); idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dictionaries."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports progress/errors to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization; subclasses redefine as needed."""
        pass

    def _real_extract(self, url):
        """Actual extraction; subclasses redefine."""
        pass

    @property
    def IE_NAME(self):
        # Derive the default name from the class name minus the 'IE' suffix.
        return type(self).__name__[:-2]

    def _download_webpage(self, url, video_id, note=None, errnote=None):
        """Fetch *url*, report progress, and return the page decoded as text.

        Raises ExtractorError on any network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            handle = compat_urllib_request.urlopen(url)
            page_bytes = handle.read()
            return page_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
121
122
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp (matched with re.VERBOSE in suitable()/_extract_id()):
    # accepts watch/embed/short URLs as well as naked video IDs, and rejects
    # playlist-style URLs.  Group 2 is the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'HxW' display string; informational only (--list-formats / format label).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # dur is optional in the XML; default each cue to 4 seconds.
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Format as HH:MM:SS,mmm (SRT timestamp); %i truncates the floats.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning, srt) pair: on success warning is None and srt is
        the .srt contents; on failure warning is a u'WARNING: ...' message
        and srt is None.
        """
        self.report_video_subtitles_download(video_id)
        # First ask for the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name (the name is needed in the fetch URL).
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: --sub-lang option, then English, then whatever
        # track happens to come first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in
        and confirm age.  All failures are reported as warnings/errors and
        abort silently (return None)."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from a URL matching _VALID_URL, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 is the ID capture; group 1 is the whole optional URL prefix.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract metadata for a YouTube video; returns a list with one
        info dict per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped slashes ("http:\/\/..." -> "http://...").
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one response
        # contains a 'token' field.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except silently skips formats that
                    # don't parse; if none match, upload_date stays the raw
                    # page string rather than YYYYMMDD.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a urlencoded dict with
            # (at least) itag and url fields.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry also carries a 'sig'
            # field — a missing 'sig' raises KeyError here; confirm against
            # current get_video_info responses.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
540
541
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Retrieve the family-filter disclaimer and disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        Returns a single-element list of info dicts, or None on error
        (errors are reported via the downloader).
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate the download.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Fix: decode the page once here.  urlopen().read() returns bytes
            # on Python 3; the old code ran str-pattern regexes over bytes and
            # then called .decode('utf-8') on str matches, both of which fail.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Fix: regex matches on the decoded page are already text; the old
        # py2-only .decode('utf-8') calls would raise AttributeError on py3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
667
668
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Announce that the video webpage is being fetched."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL plus title/uploader/date
        for a Dailymotion video.  Returns a single-element info list, or
        None after reporting an error."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path component may carry a slug and query noise after the id.
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so age-gated
        # content is served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The stream URLs live in a JS "flashvars" assignment.
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe qualities from best to worst and keep the first one present.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: regular owner markup first, then the "official user" span.
        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; reorder to YYYYMMDD.
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
765
766
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a photobucket clip.

        Returns a single-element list of info dicts, or None on error
        (errors are reported via the downloader).
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Fix: decode the page once here.  urlopen().read() returns bytes
            # on Python 3; the old code ran str-pattern regexes over bytes and
            # then called .decode('utf-8') on str matches, both of which fail.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Fix: matches against the decoded page are already text; the old
        # py2-only .decode('utf-8') calls would raise AttributeError on py3.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
830
831
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-'/watch/' URLs are rewritten to the canonical English-language
        '/watch/' form and this method recurses once (new_video=False).
        Returns a single-element list with the info dict, or None after
        reporting a problem via self._downloader.trouble().
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Bug fix: group(1) captures the 'people'/'profile' path segment of
        # the profile link; the uploader's display name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
973
974
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs. The subdomain separator dot is escaped
    # so that only literal 'www.' / 'player.' prefixes match, not arbitrary
    # characters (the previous unescaped '.' matched e.g. 'wwwXvimeo.com').
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page.

        Parses the embedded player config JSON for title, uploader,
        thumbnail and file variants, then picks the best quality/codec
        combination. Returns a single-element list with the info dict,
        or None after reporting a problem.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by slicing it out of the page script.
        # Bug fix: a bare 'except:' here also swallowed KeyboardInterrupt
        # and SystemExit; catch Exception so those still propagate.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available variant in quality preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1089
1090
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages are recognized by their 'index-NNN.html' basename.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body.

        Returns None (after reporting through the downloader) when the
        download fails or the URL is invalid.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect capture groups into a dict.

        matchTuples is a list of (group-index, key, error-message) triples;
        each named group is stored under its key. Returns None after
        reporting the first missing group.

        NOTE(review): if fetch_webpage() failed, page is None and
        re.search(regex, None, ...) raises TypeError rather than reporting
        cleanly — presumably never hit in practice; confirm before relying
        on this error path.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page to its rtmp URL.

        NOTE(review): the computed video_url below is built but never
        returned or stored, so callers receive None and the live path
        produces no downloadable info dict — looks like a latent bug;
        confirm intended behavior before fixing.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 catch-up page to its info dict.

        Follows three hops: the embed parameter on the page, the
        language-specific <video> reference, then the final XML with id,
        title, date and the 'hd' quality URL.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or the arte+7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live path returns nothing extractable (see extractLiveStream).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1225
1226
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain on the resolved URL.
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then scrape the page
        for a direct media URL (JW Player flashvars or file=/source=)."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Bug fix: this branch previously reported 'unable to extract
            # title', masking the actual failure.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1371
1372
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearch[N|all]:terms' query and fetch the results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # No count prefix -> a single result; 'all' -> the service maximum.
        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_youtube_results)

        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            return self._download_n_results(query, 1)

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results at a time, until either n
        # results are collected or the service reports fewer total items.
        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            try:
                data = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids.extend(entry['id'] for entry in api_response['items'])
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the final page, then enqueue everything.
        del video_ids[n:]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1447
1448
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch[N|all]:terms' query and fetch the results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # No count prefix -> a single result; 'all' -> the service maximum.
        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_google_results)

        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            return self._download_n_results(query, 1)

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        return self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        pagenum = 0

        def _enqueue_all():
            # Hand every collected id to the downloader.
            for video_id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicating as we go.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in video_ids:
                    continue
                video_ids.append(candidate)
                if len(video_ids) == n:
                    # Specified n videos reached
                    _enqueue_all()
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _enqueue_all()
                return

            pagenum += 1
1529
1530
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch[N|all]:terms' query and fetch the results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # No count prefix -> a single result; 'all' -> the service maximum.
        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_yahoo_results)

        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            return self._download_n_results(query, 1)

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        return self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        already_seen = set()
        pagenum = 1

        def _flush():
            # Hand every collected id to the downloader.
            for video_id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicating across all pages.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in already_seen:
                    continue
                already_seen.add(candidate)
                video_ids.append(candidate)
                if len(video_ids) == n:
                    # Specified n videos reached
                    _flush()
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _flush()
                return

            pagenum = pagenum + 1
1615
1616
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Capture groups: (1) playlist-type prefix ('p', 'a', 'list', ...),
    # (2) the playlist id, (3) an optional trailing single-video id.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Presence of this string marks a further results page.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id in the playlist and enqueue each one,
        honoring the downloader's playliststart/playlistend parameters."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (de-duplicated within each page only)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # playliststart is 1-based in params; convert to 0-based slicing.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            # Both bounds index the full, unsliced id list.
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1695
1696
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk every listing page of the channel and enqueue each video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the page's video ids, dropping duplicates within the
            # page while preserving their order of appearance.
            page_ids = []
            for new_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if new_id not in page_ids:
                    page_ids.append(new_id)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1747
1748
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Collects the ids of all videos uploaded by a user via the GData API
    (paging through results) and queues each watch-page URL on the
    downloader, honouring --playlist-start/--playlist-end.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # maximum results per GData query
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    # NOTE: the previous __init__ override only delegated to
    # InfoExtractor.__init__ with the same signature and was removed;
    # the inherited constructor is identical.

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start indices are 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order and
            # dropping duplicates within this page.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based; --playlist-end of -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1830
1831
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their page, then walks the AJAX
    episode list page by page and queues every video URL on the
    downloader, honouring --playlist-start/--playlist-end.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # videos returned per AJAX page
    IE_NAME = u'blip.tv:user'

    # NOTE: the previous __init__ override only delegated to
    # InfoExtractor.__init__ with the same signature and was removed.

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: a missing data-users-id marker used to raise an uncaught
        # AttributeError (mobj.group on None); report it explicitly instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUGFIX: use compat_str instead of str for consistency with
                # the rest of the file (str() may fail on non-ASCII error
                # messages under Python 2).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # --playlist-start is 1-based; --playlist-end of -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/' + video_id])
1922
1923
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the response once at the I/O boundary so the
            # str regexes below work on Python 3 (urlopen().read() returns
            # bytes there); previously each extracted value was decoded
            # individually, which raises AttributeError on Python 3 str.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the HTML whitespace into single spaces.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1982
1983
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Extractor is flagged as not working (see the _WORKING convention);
    # the code below is kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers as they appear in the page, best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each entry maps a result key to the regex whose
        # first group captures its value in the page source.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") assumes mobj.group(1)
                # is a Python 2 byte string; this breaks on Python 3 str — confirm.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials.

        Missing downloader or missing credentials silently skips login;
        login failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A broken .netrc only warns: login is optional here.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login form, we are not logged in.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page and return a list of info dictionaries."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image: missing thumbnail is only a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz yields a 10-tuple; strftime takes the
                    # first nine fields. Failures leave upload_date as None.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if url_map:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # --format-limit caps quality at the requested format.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never bound
        # and the loop below raises NameError — confirm intended behavior.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2189
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Request the site's JSON description of the video by appending
        # the skin=json parameters to the original URL.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode('UTF-8') assumes a Python 2 byte
                # string; on Python 3 basename is already str — confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; its body is read
                # only now, when this was not a direct download.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): mutates the module-global std_headers, so later
        # requests (including the actual download) are sent with an iTunes
        # User-Agent — side effect outlives this extractor call.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2279
2280
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the flv URL from the thumbnail link found in the watch page.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # NOTE: the previous __init__ override only delegated to
    # InfoExtractor.__init__ with the same signature and was removed.

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: this called self._download.trouble (attribute does not
            # exist), raising AttributeError on invalid URLs.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media base URL is embedded in the thumbnail link; the flv
        # URL is that base plus "/<id>.flv".
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2329
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Bitrates offered by the config feed, lowest quality last in turls.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the known formats with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms (":tds", ":colbert", ...) are rewritten to the
        # show's full-episodes page and re-matched.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "newest episode"; the site
            # redirect below resolves which one that is.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Pick the episode title out of the URL the server redirected to.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the final player URL.
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        # Presumably one <item> per act/segment of the episode — each
        # becomes its own info dict in the result list.
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Each <rendition> advertises one (bitrate, stream URL) pair.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2540
2541
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # Captures the show name and episode slug from escapistmagazine.com video URLs.
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration file is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Scrape the video page for the player config URL, then read the
        actual media URL from the (JavaScript) config playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in Content-Type, falling back to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the four searches below assume their meta tags are always
        # present; a missing tag raises AttributeError on .group(1) — confirm the
        # page format, or guard like the other extractors in this file do.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config file location as a query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # rewritten before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2615
2616
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked non-working upstream; kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest it points
        to, and assemble the direct fragment URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore parameter is expected by Adobe HTTP Dynamic Streaming servers.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe manifest namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Rebuild the fragment URL from the manifest host and the ids above.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2687
2688
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the page for the FLV URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Media URL is URL-encoded inside the page source.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> element.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: keep the whole matched URL, not just the basename group.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2746
2747
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the page URL to a track via the public API, then fetch the
        stream definitions and return the 128k mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps the human-readable page URL to the API track resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint lists direct media URLs per format/bitrate.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # NOTE(review): 'upload_date' passes through the API's created_at string
        # unchanged, not the YYYYMMDD form documented on InfoExtractor — confirm.
        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2820
2821
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64-obfuscated media id from the page and build the
        rtmpe stream URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in the jsclassref attribute.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title is assigned to a JS variable in the page.
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder.
        video_description = u'No description available.'
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)

        # Derive id and extension from the final path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2875
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for fmt; 'best' or an unknown bitrate selects
        the highest available one. If the format entry carries no bitrate
        mapping at all, the raw list is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list)  # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError:  # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError:  # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch the cloudcast JSON and pick a working media URL for the
        requested (or best) format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUGFIX: the old .decode('utf-8') calls broke on Python 3, where
        # re match groups are already str (str has no decode()).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format until one of its URLs actually responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # BUGFIX: check_urls may return None (no working link); the old code
        # then crashed on file_url.split(...).
        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find a working media URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2990
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: a specific video page (course+video), a course
    # page (course only), and the site root.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course playlist, or root
        playlist; playlists recurse via self.extract on each linked page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in a sidecar XML next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # NOTE(review): .read() returns bytes on Python 3, but the
                # re.search calls below use str patterns — confirm this path
                # still works, or decode the page first.
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every video page linked from the course page (deduped,
            # order preserved) and extract each one recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every course page linked from the root becomes a sub-playlist.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3107
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Read song metadata from the page's meta tags, then pick the
        highest-quality rendition from the mediaGen XML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUGFIX: webpage is already a decoded text string, so the old
        # .decode('iso-8859-1') on the match groups raised AttributeError on
        # Python 3 (str has no decode) and was a no-op-at-best on Python 2.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUGFIX: message previously read 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # Message aligned with the file's 'ERROR: ...' convention.
            self._downloader.trouble(u'ERROR: Invalid rendition field.')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }]
3187
3188
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        # NOTE(review): relies on self.IE_NAME which this class does not define;
        # presumably inherited from InfoExtractor — confirm.
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time in ms plus
        two random components."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the character alphabet using the
        server-provided seed (linear-congruential style mixing); returns the
        shuffled list used to decode file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated encoded file id: each component indexes into
        the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the segment file ids, and return
        one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's --format request to Youku's stream names:
            # best -> hd2 if available else flv; worst -> mp4; anything else -> flv.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3298
3299
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape video URL, title and thumbnail using the class-level
        regular expressions."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the video page.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3362
3363
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Scrape a G+ post page for metadata, follow the photo/video page,
        and pick the highest-resolution stream."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: missing return — the old code fell through and crashed
            # with AttributeError on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: missing return — the old code fell through and crashed
            # with IndexError on links[-1].
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3487
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com / watch.nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped),
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed: key was misspelled 'uploader_date'; the field consumed
            # downstream (see the InfoExtractor docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3523
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the justin.tv info JSON API.

        Returns a tuple (total items in the response, info dicts for the
        items that have a non-empty video URL).  On download failure the
        error is reported and (0, []) is returned.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Fixed: returning None here made the caller's tuple unpacking
            # in _real_extract raise TypeError; return an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; strip the dashes
                # to get the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only a channel name matched: page through its whole archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3602
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Fixed: without this return, m.group() below raised
            # AttributeError on None instead of failing cleanly.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # Fixed: same fall-through crash as above.
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional; missing is not an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3639
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # Fixed: without this return, m.group(1) below raised
            # AttributeError on None instead of failing cleanly.
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if m:
            # Strip embedded links from the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
        else:
            # Only a warning in the original; guard so we do not crash on None.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The tweet text doubles as the video title.
        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3688
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on the Steam store."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains whitespace and comments, so it must be
        # matched with re.VERBOSE (the default suitable() would not).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        # Always fetch the game's video listing page, whatever URL form we got.
        listing_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(listing_url, game_id)

        movie_pattern = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_pattern = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
        movie_matches = re.finditer(movie_pattern, webpage)
        title_matches = list(re.finditer(title_pattern, webpage))

        # Movie entries and title spans appear in the same order on the page;
        # pair them up positionally.
        videos = []
        for movie, title in zip(movie_matches, title_matches):
            clip_id = movie.group('videoID')
            clip_url = movie.group('videoURL')
            if not clip_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': title.group('videoName')
            })
        return videos
3724
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and channel id are exposed as data attributes on the page.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            # The download URL is derived directly from the video id.
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3746
3747
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordered: more specific extractors must precede the ones
    # they would otherwise be shadowed by (GenericIE is the catch-all).
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
3791
3792