]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Make uploader and upload_date fields optional
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import netrc
9 import os
10 import re
11 import socket
12 import time
13 import email.utils
14 import xml.etree.ElementTree
15 import random
16 import math
17
18 from .utils import *
19
20
class InfoExtractor(object):
    """Base class for information extractors.

    Given a URL, an information extractor produces information about the
    video (or videos) the URL refers to: the real media URL, the title,
    the author and so on.  The information is returned as a list of
    dictionaries which is handed to the FileDownloader, which in turn may
    download the media to the file system, among other outcomes.

    Mandatory dictionary fields:

        id:     Video identifier.
        url:    Final video URL.
        title:  Video title, unescaped.
        ext:    Video filename extension.

    Optional fields:

        format:      The video format, defaults to ext (used for --get-format)
        thumbnail:   Full URL to a video thumbnail image.
        description: One-line video description.
        uploader:    Full name of the video uploader.
        upload_date: Video upload date (YYYYMMDD).
        uploader_id: Nickname or id of the video uploader.
        player_url:  SWF Player URL (used for rtmpdump).
        subtitles:   The .srt file contents.
        urlhandle:   [internal] The urlHandle to be used to download the file,
                     like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define the _real_initialize() and _real_extract()
    methods, define a _VALID_URL regexp, and probably be added to the list
    of extractors.  _real_extract() must return a *list* of information
    dictionaries as described above.  Broken extractors should set the
    _WORKING attribute to False so users are warned and tests are skipped.
    """

    _ready = False        # True once _real_initialize() has been run
    _downloader = None    # FileDownloader instance (may stay None)
    _WORKING = True       # set to False in broken subclasses

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # By convention the extractor name is the class name minus the
        # trailing "IE" (e.g. YoutubeIE -> "Youtube").
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, keeping the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        raw_bytes = handle.read()
        return raw_bytes.decode('utf-8', 'replace')
126
127
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp (must be compiled with re.VERBOSE): matches watch,
    # embed, short and naked-ID URLs.  Group 1 is the entire URL prefix,
    # group 2 the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; unknown itags default to 'flv' at use site
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> video dimensions, as printed by --list-formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt subtitle text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when dur attribute is missing
            start = float(start)
            end = start + float(dur)
            # SRT timestamp format: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two elements is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the listing page
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: --subtitles-lang, then English, then whatever
        # the listing returned first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print available itags with extension and dimensions (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set English language on youtube.com and optionally log in and
        confirm age, using --username/--password or .netrc credentials."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID for url, or None (after reporting an error)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 1 is the optional URL prefix; group 2 is the video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract video information; returns a list with one dict per
        selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the backslash-escaped URL found in the page's JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one response
        # contains a 'token' field.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, normalized to YYYYMMDD
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except silently ignores any parse
                    # failure and tries the next format expression.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict also has a 'sig' key;
            # a stream without one would raise KeyError here — confirm
            # against current get_video_info responses.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # One result dictionary per selected format
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
545
546
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        age-restricted videos become accessible."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, media URL, title and uploader; returns a one-dict list."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate the download.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode to text immediately: the patterns below are str regexes,
            # and matching them against raw bytes fails on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob when no direct mediaURL param
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage is already str, so no .decode() calls here (they would
        # raise AttributeError on Python 3 str objects).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
672
673
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, best-quality media URL, title, uploader and date."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the URL path component before any '_' suffix or query.
        video_id = url_match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the best quality available, highest first.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        media_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(media_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: regular owner span first, then the "official user"
        # markup; a missing uploader is only a warning.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # Upload date is shown as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
760
761
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, media URL, title and uploader; returns a one-dict list."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to text immediately: the patterns below are str regexes,
            # and matching them against raw bytes fails on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage is already str, so no .decode() calls here (they would
        # raise AttributeError on Python 3 str objects).
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
825
826
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info, rewriting non-'/watch/' URLs first.

        Non-extractable URL forms are resolved to a canonical
        http://video.yahoo.com/watch/<vid>/<id> URL and re-extracted
        (new_video=False guards against a second rewrite pass).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the 'people'/'profile' path component of the
        # href; the uploader name is the anchor text in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width, needed for the playlist request below
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
968
969
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.  The dot after (www|player) is escaped
    # so that e.g. 'http://wwwXvimeo.com/123' is no longer accepted.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the embedded player config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup script
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
            # are no longer swallowed; split IndexError and json errors are
            # still reported as an extraction failure.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; YYYYMMDD when present)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; the for/else reports failure only
        # when no codec bucket had any entry at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1084
1085
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map the given groups to a dict.

        matchTuples is a list of (group_index, key, error_message); on any
        missing group the error is reported and None is returned.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed it returns None, and
        # re.search(None) below raises TypeError instead of reporting
        # trouble cleanly — verify whether an early return is wanted here.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live-stream page."""
        # Language code is a path component, e.g. .../fr/videos/...
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but never returned, and
        # _real_extract discards this method's result — the live-stream
        # path currently produces no download; looks unfinished.

    def extractPlus7Stream(self, url):
        """Follow the chain of intermediate pages for an ARTE+7 video
        and return its info dictionary."""
        video_lang = url.split('/')[-3]
        # Step 1: the player param carries the videoref file URL
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> reference
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final XML holds id, title, date and the HD URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages take the (currently dead-end) live-stream path;
        # everything else is an ARTE+7 video.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1220
1221
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # HEAD avoids downloading the body just to learn the final URL
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means there was no redirect to follow
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scrape a direct media URL from the page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this failure path previously reported
            # 'unable to extract title' (copy-pasted from the title branch)
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1366
1367
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API endpoint; start-index is 1-based, max 50 results per page
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse 'ytsearch[N|all]:terms' and queue the matching downloads.

        No prefix means 1 result, 'all' means _max_youtube_results, and a
        number is clamped to that same maximum (with a warning).
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more pages than the API says exist
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested count
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1442
1443
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Regex for video links in a result page
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present only while further result pages exist
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse 'gvsearch[N|all]:terms' and queue the matching downloads.

        Same prefix convention as ytsearch: empty -> 1, 'all' -> maximum,
        number -> clamped to _max_google_results.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (de-duplicated), stopping as soon as
            # n unique ids have been collected
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No 'next' link: queue whatever was found and stop
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1524
1525
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Present only while further result pages exist
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse 'yvsearch[N|all]:terms' and queue the matching downloads.

        Same prefix convention as ytsearch: empty -> 1, 'all' -> maximum,
        number -> clamped to _max_yahoo_results.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated via already_seen,
            # stopping as soon as n unique ids have been collected
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No 'next' link: queue whatever was found and stop
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1610
1611
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: playlist type prefix (p/a/list), group 2: playlist id,
    # group 3: a single-video id appended after the playlist path
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Present only while further playlist pages exist
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids in the playlist and queue each download,
        honoring the playliststart/playlistend downloader params."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (de-duplicated within each page)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # playliststart is 1-based in params; -1 for playlistend means "to the end"
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1690
1691
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk every listing page of the channel and queue each video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)

        # Gather video ids from each paginated channel listing page.
        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            try:
                page = compat_urllib_request.urlopen(
                    compat_urllib_request.Request(page_url)).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Pull the watch links out of this page, de-duplicated per page
            # but preserving order.
            page_ids = []
            for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = link.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # Stop once the page no longer advertises a next page.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1742
1743
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Enumerate a user's uploads via the GData API and queue them all."""
        # Pull the username out of the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request successive pages until one comes back short.
        video_ids = []
        pagenum = 0
        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            request = compat_urllib_request.Request(gdata_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids present on this page, skipping duplicates.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            # A short page means we just fetched the last one; stop here
            # instead of issuing a pointless extra query.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour the --playlist-start / --playlist-end options.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1825
1826
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Enumerate a blip.tv user's episodes via the AJAX listing and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # Fetch the user page once to resolve the numeric users_id that the
        # AJAX episode-list endpoint requires.
        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Fix: the original dereferenced this match without checking for
        # None, so a page without data-users-id crashed with AttributeError
        # (which the network-error try/except above does not catch).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the other extractors
                # (the original used plain str here).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # De-duplicate on the unescaped value; the original compared
                # the raw match against already-unescaped entries, letting
                # HTML-escaped duplicates through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour the --playlist-start / --playlist-end options.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1917
1918
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Fix: decode the response once so every regex below operates on
            # text; the original kept bytes and matched them with str
            # patterns, which fails on Python 3, then called .decode() on
            # the results.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the site's multi-line notice into one line.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1977
1978
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Extractor is currently marked broken; kept until it is repaired.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names as they appear in the page's inline JavaScript.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data
        # Each entry maps an info key to the regex that captures its value
        # from inline JavaScript on the video page.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") assumes a Python 2
                # byte string match here - confirm before running on Python 3.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are available (options or .netrc)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials: proceed without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is still present in the response, the
            # credentials were rejected (or we were rate-limited).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse its metadata and return the info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (non-fatal: fall back to an empty string)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz returns a 10-tuple; strftime only accepts
                    # the first nine fields.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if url_map:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list to formats at or below the limit.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # Build one info dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2184
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch the page's JSON representation (or detect a direct media URL)
        and return the info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the site for the JSON representation of the page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The site serves usable metadata to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL is the media file itself; derive id/title/ext
                # from the last path component.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode('UTF-8') assumes a Python 2 str here.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened successfully above, otherwise the except
                # branch would have raised already.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Normalize the '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                # Covers both malformed JSON values and missing keys.
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2274
2275
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the direct .flv URL and title for a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: the original called self._download.trouble (missing 'er'),
            # which raised AttributeError instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the video itself
        # sits next to it as <video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2324
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate-named format ids known for these shows.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the episode/clip URL to its parts and return one info dict per part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shorthand like ':tds' maps to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Decide whether this is a clip, a specific episode, or the newest
        # episode (a bare full-episodes URL).
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The bare full-episodes URL redirects to the latest episode;
            # re-parse the final URL to recover that episode's title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Locate the mgid-style media URI embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The index feed lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Each part has its own configuration listing the renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the configuration.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) URL onto the plain-HTTP mirror host.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2518
2519
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Resolve the page's player configuration to the video URL and
        return the info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in the Content-Type header,
            # falling back to UTF-8 when none is given.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull description, thumbnail and player URL from the page's meta tags.
        # NOTE(review): each re.search below is assumed to match; a page
        # missing one of these tags would raise AttributeError - confirm
        # whether that is acceptable here.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the configuration location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual video URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2593
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): _WORKING = False presumably marks this extractor as
    # broken/disabled -- confirm against the InfoExtractor base class.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract in two steps: metadata XML, then an Adobe f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Step 1: fetch the per-video metadata XML.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Step 2: fetch the manifest the metadata points at.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Manifest elements live in the Adobe f4m XML namespace.
            # Note: video_id is deliberately rebound to the manifest's id.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL from the manifest host and ids.
        # NOTE(review): video_id[:-2] drops the last two characters of the
        # manifest id -- the reason is not evident from this file.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2664
2665
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The real video URL is URL-encoded in a flashvars-style parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: the whole matched URL is wanted (group 0), not the
        # trailing filename capture.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2723
2724
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track page URL is resolved through the public SoundCloud API
    (resolve.json) to obtain the track metadata; the MP3 stream URL is
    then read from the per-track streams endpoint.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track URL is being resolved through the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader name and song slug are both encoded in the URL path.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2797
2798
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is embedded base64-encoded in the page source.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title is assigned to a JavaScript variable in the page.
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)
        else:
            video_description = u'No description available.'

        # Derive the id and extension from the last path segment of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2852
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format/bitrate.  When the
        format entry carries no bitrate mapping (TypeError on indexing),
        the entry itself is the url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print all available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Extract uploader & filename from the url.  The regex groups are
        # already text strings; the old .decode('utf-8') calls here (and on
        # file_url/format_param/player_url below) crashed on Python 3,
        # where str has no .decode().
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one of its urls responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2967
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL type: single video, course page, or site root.

        Course and root pages act as playlists: every linked page is fed
        back through self.extract() and the results are concatenated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata XML lives next to the course's videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage becomes a reference entry that is
            # re-extracted recursively below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every CoursePage link is extracted recursively, so the root
            # page expands to the entire site's videos.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3084
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns an already-decoded text string, so the
        # old .decode('iso-8859-1') calls on the matches were wrong and
        # crashed under Python 3; only HTML-unescaping is needed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed garbled error message (was: 'unable to mtvn_uri').
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML list of renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3164
3165
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character table with a linear
        # congruential generator driven by the server-provided seed.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Decode the obfuscated file id: each '*'-separated number is an
        # index into the seed-shuffled character table.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract one info dict per video segment from the playlist JSON."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format onto Youku's stream names:
            # 'best' prefers hd2 when available, 'worst' uses mp4, anything
            # else falls back to flv.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3275
3276
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The video URL is URL-encoded in a flashvars-style parameter.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3339
3340
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video from a Google+ post.

        Two steps: the post page yields date/uploader/title and the
        photo-viewer URL; the viewer page lists direct video URLs per
        resolution, of which the highest is chosen.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
            self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
            self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
            self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Fixed: previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the viewer page.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Fixed: previously continued with an empty list and crashed.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3464
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the path component of the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped), or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key was misspelled 'uploader_date', so the scraped date was
            # silently ignored; the downloader expects 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3500
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        Returns (0, []) on download or API errors so callers can always
        tuple-unpack the result (previously this returned None, which made
        the unpacking in _real_extract raise a TypeError).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API answers with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only the channel name matched: iterate the paged archive API.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3585
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            return  # was missing: m.group() on None raised AttributeError
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return  # was missing: m.group() on None raised AttributeError
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None when the meta tag is absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3622
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return  # was missing: m.group() on None raised AttributeError
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Only a warning: proceed without a description instead of crashing
            # on m.group() below.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded <a> tags, keeping only the plain tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return  # was missing: m.group() on None raised AttributeError
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return  # was missing: int(m.group(1)) on None raised AttributeError
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The tweet text doubles as the video title.
        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3671
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose style, so re.VERBOSE must be
        # passed explicitly here and in _real_extract.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always scrape the game's video listing page, whatever URL form we got.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Movie entries and title spans appear in the same order on the page.
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                # Report and skip; previously an entry with an empty URL was
                # still appended to the results.
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
3707
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The recording id comes straight from the URL.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title and uploader are scraped from data attributes on the page.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3729
3730
3731
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 is like '480p_370k_...': resolution then bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: the original tested the stale 'result' variable here,
            # so an unavailable requested format returned [None] instead of
            # reporting an error.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3848
3849
3850
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the id and the title are encoded in the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Fetch the page containing the player setup.
        webpage = self._download_webpage(url, video_id)

        # Locate the direct .flv URL inside the player configuration.
        match = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group('url'))

        # Locate the upload date shown next to the video.
        match = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        upload_date = match.group('date')

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
3892
3893
3894
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Download the watch page.
        webpage = self._download_webpage(url, video_id)

        # The title comes from the page's <title> element.
        match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = match.group('title').strip()

        # The watch page references an embed page that holds the media URL.
        match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = match.group(0).strip()
        video_id = match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file via an encodeURIComponent call.
        match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3940
3941
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep GenericIE last: it matches almost anything and must only be
    # consulted when no specialized extractor claims the URL.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
3988
3989