]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
ExtractorError for errors during extraction
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor ("IE") takes a URL and produces the metadata
    needed to download the video(s) it refers to: the real media URL, the
    title, the uploader, and so on.  The result is a list of dictionaries
    that is handed to the FileDownloader, which may then download the
    video to the file system, among other possible outcomes.

    Mandatory dictionary fields:

        id:             Video identifier.
        url:            Final video URL.
        title:          Video title, unescaped.
        ext:            Video filename extension.
        uploader:       Full name of the video uploader.
        upload_date:    Video upload date (YYYYMMDD).

    Optional fields:

        format:         The video format, defaults to ext (used for --get-format)
        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.
        uploader_id:    Nickname or id of the video uploader.
        player_url:     SWF Player URL (used for rtmpdump).
        subtitles:      The .srt file contents.
        urlhandle:      [internal] The urlHandle to be used to download the file,
                        like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regular expression and override
    _real_initialize() and _real_extract(); _real_extract() must return a
    *list* of dictionaries shaped as above.  They should probably also be
    added to the list of extractors.  Broken IEs should set _WORKING to
    False so that users are warned and the tests skip them.
    """

    # Shared default state; instances shadow these as needed.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Perform one-time setup (authentication, etc.) on first use."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader that will consume our results."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the public name from the class name minus its 'IE' suffix.
        return type(self).__name__[:-2]
107
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string (used by --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written for re.VERBOSE, so the base-class check
        # (which matches without flags) cannot be reused here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration get a 4-second default.
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # SRT timestamps are HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning, srt_contents) pair; exactly one of the two is
        not None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then whatever
        # track comes first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available format codes with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next': '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video id from url, or None (after reporting) if invalid."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 1 is the whole optional URL prefix; group 2 is the video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in turn until one of them
        # yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:	# don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Stop at the first expression that parses; without the
                    # break the already-normalized date was fed back into
                    # strptime for the remaining expressions.
                    break
                except ValueError:
                    # This was a bare 'except:' before, which also swallowed
                    # KeyboardInterrupt/SystemExit; only a parse failure is
                    # expected here.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Build itag -> URL.  Not every stream entry carries a 'sig'
            # parameter; appending it unconditionally raised KeyError for
            # signature-less streams, so only attach it when present.
            url_map = {}
            for ud in url_data:
                stream_url = ud['url'][0]
                if 'sig' in ud:
                    stream_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = stream_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
525
526
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST to disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ("yt-<id>" ids are mirrors);
        # delegate those to the YouTube extractor via the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information.
        # Decode immediately: the page was previously kept as bytes, which
        # made every str-pattern re.search below (and the later
        # .decode('utf-8') calls on str groups) fail on Python 3.
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # No direct media URL; fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
652
653
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The id is the first path component after /video/, stripped of any
        # "_title" suffix and query string.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # The media URLs live in a JavaScript "flashvars" assignment.
        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Choose the best quality present, scanning from highest to lowest.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Prefer the regular owner markup; fall back to the "official user"
        # span when it is absent.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # The page shows the date as DD-MM-YYYY; store it as YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
750
751
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Decode immediately: the page was previously kept as bytes, which
        # made the str-pattern re.search calls below (and the later
        # .decode('utf-8') calls on str groups) fail on Python 3.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
815
816
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Normalizes any supported Yahoo! Video URL to the canonical English
    '/watch/' page, scrapes title/uploader/thumbnail/description from it,
    then queries the playlist service for the actual FLV media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video URL.

        Non-'/watch/' URLs are rewritten to the canonical form and then
        re-extracted recursively (with new_video=False to mark the hop).
        Returns a single-element list of info dicts, or None on error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the 'people'/'profile' URL path segment; the
        # uploader display name is the anchor text captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (both required by the playlist
        # service request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
958
959
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the page-embedded config JSON for metadata and request
    signature, then builds a play_redirect URL for the best available
    codec/quality combination.
    """

    # _VALID_URL matches Vimeo URLs.
    # BUGFIX: the dot after (www|player) was unescaped and matched any
    # character; it is now a literal subdomain separator.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo URL.

        Returns a single-element list of info dicts, or None on error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is absent from the page;
            # ValueError: json.loads got invalid JSON. A bare except here
            # would also swallow KeyboardInterrupt/SystemExit.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first populated quality bucket, best first
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1074
1075
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw page body.

        On network failure or an invalid URL, reports the error through
        the downloader and returns None.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and search it with regex.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under key in the returned dict. Returns
        None (after reporting the given error) when the regex does not
        match or a required group is empty.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): page is None when fetch_webpage failed; re.search
        # would then raise TypeError instead of failing gracefully.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection and locate the RTMP URL.

        NOTE(review): the final video_url is computed but never returned
        or stored, so live streams are effectively not extracted (see
        _real_extract, which discards this call's result).
        """
        # Language code is encoded in the URL path (e.g. .../fr/...)
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file contains the stream path, SWF player and RTMP base
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two XML indirections to an info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the flash movie param points at a videoref XML file
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> reference
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: read id/title/date and the HD stream URL from the video XML
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and Arte+7 extraction by URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so live
            # URLs currently yield no downloadable result.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1210
1211
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects, then scrapes the page for an
    embedded media URL (JW Player flashvars or generic file=/source=
    parameters).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so no body is transferred
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the normal extraction continue
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the whole download chain with the resolved URL
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Scrape an arbitrary page for an embedded media URL.

        Returns a single-element list of info dicts, or None on error.
        """
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this path failed to find the uploader, not the title;
            # the old message was a copy-paste of the title error.
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1356
1357
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Understands 'ytsearch:Q', 'ytsearchN:Q' and 'ytsearchall:Q' and
    queues the matching watch URLs via the gdata API.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        else:
            try:
                requested = int(prefix)
                if requested <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (requested, query))
                    return
                if requested > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, requested))
                    requested = self._max_youtube_results
                self._download_n_results(query, requested)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        # The API serves 50 results per page; keep paging until enough
        # ids are collected or the reported total is exhausted.
        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                raw = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(raw)['data']

            video_ids.extend(entry['id'] for entry in api_response['items'])
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last page, then queue the downloads
        del video_ids[n:]
        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1432
1433
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Understands 'gvsearch:Q', 'gvsearchN:Q' and 'gvsearchall:Q' and
    queues matching videoplay URLs.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                requested = int(prefix)
                if requested <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (requested, query))
                    return
                if requested > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, requested))
                    requested = self._max_google_results
                self._download_n_results(query, requested)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
        return

    def _queue_downloads(self, video_ids):
        """Hand every collected id to the downloader as a videoplay URL."""
        for vid in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect unique identifiers; stop as soon as n are gathered
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in video_ids:
                    video_ids.append(candidate)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            # Last results page: queue whatever was found
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(video_ids)
                return

            pagenum += 1
1514
1515
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Understands 'yvsearch:Q', 'yvsearchN:Q' and 'yvsearchall:Q' and
    queues matching watch URLs.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                requested = int(prefix)
                if requested <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (requested, query))
                    return
                if requested > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, requested))
                    requested = self._max_yahoo_results
                self._download_n_results(query, requested)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
        return

    def _queue_downloads(self, video_ids):
        """Hand every collected id to the downloader as a watch URL."""
        for vid in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect unique identifiers; stop as soon as n are gathered
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in already_seen:
                    video_ids.append(candidate)
                    already_seen.add(candidate)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            # Last results page: queue whatever was found
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1
1600
1601
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks every page of a playlist (or artist/course variant), honors
    the playliststart/playlistend parameters and queues each watch URL.
    """

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Resolve a playlist URL into individual watch-URL downloads."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group(3) holds the direct video reference
        if match.group(3) is not None:
            self._downloader.download([match.group(3)])
            return

        # prefix is 'p' as default for playlists but there are other
        # types (e.g. artist pages) that need extra care
        playlist_prefix = match.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = match.group(2)

        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather identifiers unique within this page, keeping order
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                vid = id_match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user-requested slice (playliststart is 1-based)
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1680
1681
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the channel's paginated video listing and queues every watch
    URL it finds.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id from the channel listing and queue downloads."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather identifiers unique within this page, keeping order
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = id_match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1732
1733
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract the username from the URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request successive pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect video ids, de-duplicated within the page
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one; no further
            # queries are needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # Apply the user's --playlist-start/--playlist-end window
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1815
1816
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then pages through the
    mobile "full episode list" endpoint collecting every video path.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Find the numeric user id embedded in the page. Previously the
        # search result was dereferenced unchecked inside the try block,
        # so a page without the marker raised an uncaught AttributeError.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users_id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (HTML-unescaped, de-duplicated per page)
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # Apply the user's playlist window before downloading
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1907
1908
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode the page like the other extractors do, so the str
            # regexes below work on both Python 2 and 3 (previously the
            # raw bytes were matched and str results were .decode()d).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1967
1968
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Marked broken upstream; kept for reference until fixed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers in descending quality order (used for selection below)
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each key maps to the regex that locates its value
        # inside the page's inline JavaScript.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        # Only found pieces are stored; callers check for missing keys.
        # NOTE(review): the .decode("unicode_escape") calls assume Python 2
        # str semantics — confirm before enabling on Python 3.
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data.

        Silently does nothing when no credentials are available; login
        failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in by POSTing the mobile login form
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse it, and return result dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader (required field)
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (required field)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional; falls back to empty string)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date (optional; best-effort RFC 2822 parse)
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Deliberately best-effort: leave upload_date as None
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if url_map:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list to formats at or below the limit
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): the .decode('utf-8') calls assume Python 2 str
            # values — on Python 3 these would fail; confirm before fixing.
            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2174
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query parameters with the right separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            # urlh is intentionally kept open: it is either stored as
            # 'urlhandle' (direct download) or read below (JSON case).
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode('UTF-8') assumes a Python 2 str
                # basename — confirm on Python 3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses may wrap the payload in a 'Post' envelope
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '08-25-12 11:25AM' -> '20120825'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Presumably blip.tv serves plain video files to the iTunes
        # user agent — TODO confirm why this global override is needed.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2264
2265
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.trouble — no such attribute, so an
            # invalid URL raised AttributeError instead of being reported.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link carries the movie base URL; the flv lives beside it
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2323
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                      $"""
    IE_NAME = u'comedycentral'

    # Bitrates in descending quality order
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL relies on re.VERBOSE whitespace
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available formats with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the URL (shortname/episode/clip), fetch the MRSS index,
        and return one info dict per media item found."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortnames like :tds redirect to the show's full-episodes page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Determine the episode title; an empty episode group means we must
        # follow the redirect to find the newest episode (dlNewest)
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-match against the URL we were redirected to
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a <div> without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one media segment
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, url) pairs from the config's renditions
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2534
2535
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Fail with an explicit error instead of an AttributeError when any
        # of the expected <meta> tags is missing from the page.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if descMatch is None or imgMatch is None or playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract episode metadata from webpage')
            return
        description = unescapeHTML(descMatch.group(1))
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL from player URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2609
2610
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is currently disabled (site/API changed); kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor video URL into a download info dict.

        Two round trips: first the moogaloop metadata XML (title,
        description, thumbnail, manifest URL), then the Adobe HDS (f4m)
        manifest, from which the fragment URL is assembled by hand.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partial result; filled in as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            # findall(...)[0] raises IndexError when an element is missing,
            # which is caught below and reported as invalid metadata.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore query parameter is required by the HDS (f4m) server
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the f4m namespace; note video_id is
            # intentionally re-bound to the manifest's <id> text here.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL by hand from the manifest host.
        # NOTE(review): video_id[:-2] drops a two-character suffix of the
        # manifest id -- presumably a quality/variant marker; confirm
        # against the HDS URL scheme before changing.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2681
2682
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being fetched."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch an xvideos page and pull out the flv URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the canonical page for this id and decode it leniently.
        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            page_bytes = compat_urllib_request.urlopen(request).read()
            webpage = page_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Flash video URL (percent-encoded inside the page source)
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Page title, minus the trailing site suffix
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: the entire matched URL is used, not just the filename group
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2752
2753
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    Resolves the track permalink through the public SoundCloud API
    (resolve.json), then queries the streams endpoint for the 128kbit
    MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report permalink resolution through the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report stream-URL retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the MP3 stream info dict for a SoundCloud track URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The uploader name and the slug of the track title are both in the URL
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink into the numeric track id via the API
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The API reports created_at as a slash-separated timestamp
        # (e.g. '2012/09/27 16:43:33 +0000'); the info-dict contract
        # requires upload_date in YYYYMMDD form, so convert it here.
        upload_date = None
        mobj = re.search(r'^(\d{4})/(\d{2})/(\d{2})', info['created_at'])
        if mobj is not None:
            upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2826
2827
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP video URL and metadata from an InfoQ page.

        The page embeds the media path base64-encoded in a jsclassref
        attribute; decoding it yields the path suffix of the RTMP URL.
        """
        # Local stdlib import: portable replacement for the Python-2-only
        # str.decode('base64') the original code relied on.
        import base64

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text up front so the str regexes below also work
            # on Python 3, where urlopen().read() returns bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL (base64-encoded in the page source)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder if missing)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # id and extension come from the last path component of the URL
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2896
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        If the format maps bitrates to URL lists, pick the requested (or
        highest) bitrate; if indexing by bitrate raises TypeError, the
        format has no bitrate levels and maps directly to a URL list.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open is enough; the body is not consumed here.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        # Dump every format/bitrate combination to stdout for --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a mixcloud URL via the /api/1/cloudcast JSON endpoint."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') on a regex group is Python-2-only
        # (str has no decode on Python 3); same for the .decode calls in the
        # returned dict below. The extractor is marked _WORKING = False.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe formats until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
3011
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, one course page, or the
        site root. The page cases recurse through self.extract on every
        linked page and concatenate the resulting info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall(...)[0] raises IndexError when an element is missing
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the video file name
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # Course title falls back to the course id if no <h1> is present
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each linked video page once, preserving page order
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference resolves via the 'specific video' branch
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect each linked course page once, preserving page order
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference resolves via the 'course page' branch
                results += self.extract(entry['url'])
            return results
3128
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the highest-quality rendition for an mtv.com video.

        Reads the page's mtv_* meta tags, then queries the mediaGen
        endpoint for the rendition list.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text immediately (the page is Latin-1 encoded) so
            # the str regexes below also work on Python 3, where
            # urlopen().read() returns bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Guard: an empty list would crash on renditions[-1] below
            self._downloader.trouble(u'ERROR: unable to find any renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3218
3219
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id: epoch millis + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character table used to decode stream file ids.

        A linear-congruential-style generator seeded by the server value
        permutes a fixed alphabet; the same seed always yields the same
        permutation, and the server encodes file ids as indices into it.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            # Pick-and-remove makes this a permutation, not sampling
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of table indices into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON for a video id and emit one info dict
        per segment of the selected format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): only 'best' (hd2/flv) and 'worst' (mp4) are
            # honored; any other explicit format request silently falls
            # back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the zero-based segment number (two hex digits) into the id
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3329
3330
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce metadata extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch an xnxx page and extract the flv URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the video page
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # All three metadata fields are mandatory; bail out on the first miss.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3393
3394
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1: retrieve the post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional; may stay None)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional; may stay None)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: use the first line of the meta description
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2: simulate clicking the image box to launch the video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out here: without a match, mobj.group(1) below would crash
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out here: links[-1] below would fail on an empty list
            return

        # Sort by resolution (first tuple element)
        links = sorted(mobj)

        # Choose the last of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only keep the url; the resolution part of the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3518
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The actual media URL follows a fixed CDN scheme derived from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Search the downloaded page; return the unescaped first group or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed typo: the documented field name is 'upload_date', not 'uploader_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3564
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
                     ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Return an empty page instead of None so the caller's tuple
            # unpacking (page_count, page_info = ...) does not crash.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Some API entries have no file URL; skip those (list holds *valid* items only)
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; strip dashes for upload_date
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives are paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archives
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3643
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Bail out: m.group('url') below would crash on None
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # Bail out: m.group('title') below would crash on None
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3690
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The status ID is the basename of the .mov file on the CDN
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # Bail out: m.group(1) below would crash on None
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            # The description also serves as the title, so we cannot continue
            return
        # Strip embedded links, then unescape the remaining tweet text
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            # Bail out: m.group(...) below would crash on None
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            # Bail out: m.group(1) below would crash on None
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3749
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains whitespace/comments, so it must be compiled VERBOSE
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_download_video_page(self, game_id):
        """Report attempt to download the game's video page."""
        self._downloader.to_screen(u'[%s] %s: Downloading video page' % (self.IE_NAME, game_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the JS movie dictionary entries embedded in the page
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        try:
            self.report_download_video_page(gameID)
            urlh = compat_urllib_request.urlopen(videourl)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Pair each movie entry with its title span; zip avoids the IndexError
        # a manual counter would raise if the page has fewer titles than movies.
        for vid, vid_title in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vid_title.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Skip this entry: an info dict without a URL is unusable downstream
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': title
            }
            videos.append(info)
        return videos
3797
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct CDN URL derived from the video id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'unable to download webpage: %s' % compat_str(err))
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            # Explicit error instead of AttributeError on m.group when the page layout changes
            raise ExtractorError(u'unable to extract title')
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            raise ExtractorError(u'unable to extract uploader')
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
3824
3825
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in matching order and instantiate them in one pass.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    ]
    return [ie_class() for ie_class in ie_classes]
3869
3870