]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
ustreamIE
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for information extractors.

    Given a URL, an extractor produces a *list* of dictionaries describing
    the video(s) behind it; the FileDownloader then acts on that data
    (possibly downloading the video to the file system, among other
    outcomes).

    Each result dictionary must carry these fields (all Unicode strings):

        id:          Video identifier.
        url:         Final video URL.
        title:       Video title, unescaped.
        ext:         Video filename extension.
        uploader:    Full name of the video uploader.
        upload_date: Video upload date (YYYYMMDD).

    Optional fields:

        format:      The video format, defaults to ext (used for --get-format)
        thumbnail:   Full URL to a video thumbnail image.
        description: One-line video description.
        uploader_id: Nickname or id of the video uploader.
        player_url:  SWF Player URL (used for rtmpdump).
        subtitles:   The .srt file contents.
        urlhandle:   [internal] The urlHandle to be used to download the file,
                     like returned by urllib.request.urlopen

    Subclasses should redefine _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; they should probably also be added to the
    list of extractors.  Set _WORKING to False on broken extractors so the
    users are warned and the tests are skipped.
    """

    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # set to False on broken extractors

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports to."""
        self._downloader = downloader

    def suitable(self, url):
        """Return True when this extractor can handle *url*."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this extractor is marked as functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.) at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the public name from the class name by dropping the
        # trailing "IE" suffix (e.g. "YoutubeIE" -> "Youtube").
        return type(self).__name__[:-2]
107
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Accepts watch/embed/short URLs as well as bare video IDs.  The heavy
    lifting happens in _real_extract(), which downloads the watch page and
    the get_video_info endpoint and returns one info dict per selected
    format.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # itag codes, listed in order of quality (best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same codes, with free (WebM) formats preferred over equal-quality MP4
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string, used by --list-formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden from the base class because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # captions without an explicit duration default to 4 seconds
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # format as HH:MM:SS,mmm as required by the .srt format
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for *video_id*.

        Returns a (error_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # map language code -> track name
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # language priority: user-requested, then English, then whatever is first
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in and confirm age before extraction."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # if the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            # NOTE(review): the response body is read but never inspected
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video ID from *url*, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # group(1) is the whole optional URL prefix; group(2) is the ID itself
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download watch page + video info and return a list of info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # un-escape the JS-escaped slashes ("\/" -> "/")
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # bare except: formats that don't parse are simply skipped
                    # NOTE(review): should be narrowed to ValueError
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is not guarded like 'itag'/'url' above — a
            # stream entry without a signature would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
525
526
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    # NOTE: the trivial __init__ that only forwarded to the base class was
    # removed; the inherited InfoExtractor.__init__(downloader) suffices.

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe watch page.

        Returns a one-element list of info dicts, or None after reporting
        an error through the downloader.  yt-prefixed ids are delegated to
        the YouTube extractor via the downloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # BUGFIX: urlopen().read() returns bytes; decode once so the
            # str regexes below also work on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # No plain mediaURL: fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # BUGFIX: regex groups on a decoded page are already str; the old
        # .decode('utf-8') calls would raise AttributeError on Python 3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
652
653
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Announce that the watch page is being fetched."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Fetch the watch page and return a one-element list of info dicts."""
        # The id is the first path component, stripped of any title
        # suffix ("_...") and query string ("?...").
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            raw_page = compat_urllib_request.urlopen(request).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URLs live in the flashvars blob.
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality actually present, from highest to lowest.
        max_quality = None
        for key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Prefer the regular owner span; fall back to the "official user"
        # markup, and warn when neither is present.
        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date, DD-MM-YYYY in the page, normalized to YYYYMMDD.
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
750
751
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    # NOTE: the trivial __init__ that only forwarded to the base class was
    # removed; the inherited InfoExtractor.__init__(downloader) suffices.

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a photobucket page.

        Returns a one-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # BUGFIX: urlopen().read() returns bytes; decode once so the
            # str regexes below also work on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # BUGFIX: regex groups on a decoded page are already str; the old
        # .decode('utf-8') calls would raise AttributeError on Python 3.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
815
816
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first resolved (via the page's embedded "id" and
    "vid" fields) to a canonical English-language /watch/ URL, which is
    then re-extracted. The media URL itself comes from a playlist XML
    service on cosmos.bcst.yahoo.com.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Re-run extraction on the canonical /watch/ URL
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is the 'people'/'profile' path component; the uploader
        # name itself is captured by group(2)
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist service)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
958
959
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the config JSON embedded in the watch page and picks the best
    available codec/quality pair (hd, then sd, then whatever else the
    config lists), building a play_redirect URL from the page's request
    signature and timestamp.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON; only the slice between ' = {config:'
        # and ',assets:' is valid JSON.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: marker not found in the page;
            # ValueError: malformed JSON. A bare except here would also
            # swallow KeyboardInterrupt/SystemExit and real bugs.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket in preference order; the
        # for/else reports an error when no known codec matched at all
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1074
1075
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Live-stream URLs (matching _LIVE_URL) and regular "Plus 7" catch-up
    URLs are handled by separate scraping chains, each built on the
    grep_webpage() helper.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download url and return its raw content; on failure the error is
        # reported via the downloader and None is returned implicitly.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch url, apply regex (with regexFlags), and return a dict
        # mapping each key to its matched group. matchTuples is a list of
        # (group_index, key, error_message) triples; any missing group
        # reports its error_message and returns None implicitly.
        # NOTE(review): if fetch_webpage failed, page is None here and
        # re.search raises TypeError — confirm callers tolerate this.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Follow the videothek JS include to find the rtmp stream location.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is built but never returned, so live
        # streams yield no download information — confirm this is the
        # intended (unsupported) behavior.

    def extractPlus7Stream(self, url):
        # Three-step chain: page -> videoref XML -> per-language video XML
        # carrying the id/title/date and the hd-quality media URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams are detected by URL shape; they produce no info
        # (see extractLiveStream above).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1210
1211
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First follows URL-shortener style redirects (restarting the download
    chain on the final URL), otherwise scrapes the page for an embedded
    media URL in JW Player/SWFObject style 'file='/'source=' flashvars.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        # HTTPErrorProcessor is defined in urllib2/urllib.request, NOT in
        # urllib.error, so it must be taken from compat_urllib_request
        # (compat_urllib_error.HTTPErrorProcessor breaks on Python 3).
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the caller extract normally
        if url == new_url:
            return False

        # Restart the download chain on the final URL
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # id and extension both come from the media URL's basename
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # was misreported as 'unable to extract title' (copy-paste bug)
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1356
1357
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch:Q', 'ytsearchN:Q' and 'ytsearchall:Q' pseudo-URLs
    by paging through the GData API and queueing the resulting watch
    URLs on the downloader.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split off the 'ytsearch<N|all>' prefix; prefix[8:] is N, 'all',
        # or '' for a bare 'ytsearch:'
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # 50 results per API page; totalItems from the response caps the
        # effective limit when fewer results exist than requested
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # list comprehension instead of list(generator) (C400)
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # 'video_id' instead of 'id' to avoid shadowing the builtin
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1432
1433
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Understands 'gvsearch:Q', 'gvsearchN:Q' and 'gvsearchall:Q' and
    queues the matching videoplay URLs on the downloader.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Guard clause: reject anything that is not a gvsearch query
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Map the prefix ('', 'all', or a number) to a result count
        if prefix == '':
            count = 1
        elif prefix == 'all':
            count = self._max_google_results
        else:
            try:
                count = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if count <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, stopping as soon as n are found
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in collected:
                    collected.append(candidate)
                    if len(collected) == n:
                        # Specified n videos reached
                        for video_id in collected:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                        return

            # No further result pages: queue what we have
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in collected:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            pagenum += 1
1514
1515
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Understands 'yvsearch:Q', 'yvsearchN:Q' and 'yvsearchall:Q' and
    queues the matching watch URLs on the downloader.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _queue_results(self, video_ids):
        # Hand every collected watch URL to the downloader, in order
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

    def _real_extract(self, query):
        # Guard clause: reject anything that is not a yvsearch query
        if re.match(self._VALID_URL, query) is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                count = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if count <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        found = []
        seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect identifiers not seen on earlier pages
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in seen:
                    continue
                found.append(candidate)
                seen.add(candidate)
                if len(found) == n:
                    # Specified n videos reached
                    self._queue_results(found)
                    return

            # No further result pages: queue what we have
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_results(found)
                return

            pagenum = pagenum + 1
1600
1601
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # group(1): list-type prefix ('p', 'a', or 'list'); group(2): the
    # playlist id; group(3): a single-video id when the URL names one
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        # NOTE(review): the bare video id (not a full URL) is handed to
        # the downloader — presumably re-matched by the extractors;
        # confirm against FileDownloader.download.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        # (only 'a' is special-cased; any other prefix, e.g. 'list', is
        # coerced to 'p'/view_play_list here)
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        # Page through until the "Next" marker disappears
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated within each page)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # playliststart is 1-based (hence the -1); playlistend == -1
        # means "to the end", otherwise it is used directly as the
        # exclusive slice bound (i.e. a 1-based inclusive index)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1680
1681
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Pages through a channel's video list and queues every watch URL on
    the downloader.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        collected = []
        pagenum = 1

        # Fetch pages until the "next page" marker disappears
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, deduplicating within this page
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        for video_id in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1732
1733
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of results the GData API returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect a user's uploads via the GData API and queue them."""
        # Pull the username out of the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps results per query (currently 50 videos), so
        # fetch successive pages until one comes back short of the cap.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect unique video ids from this page, keeping order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one; no further
            # queries are needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        # Apply the playliststart/playlistend window requested by the user.
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1815
1816
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Number of videos per AJAX episode-list page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all of a blip.tv user's videos and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id required by the AJAX endpoint is embedded in
        # the profile page. Bail out with a clear error if it is missing
        # instead of crashing with AttributeError on mobj.group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of the file;
                # plain str() can fail on unicode messages under Python 2.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        # Apply the playliststart/playlistend window requested by the user.
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1907
1908
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the page to text immediately. The str regex
            # patterns below raise TypeError against a bytes page on
            # Python 3, and the old .decode('utf-8') calls on the (str)
            # match results raised AttributeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # Extension without the leading dot, e.g. 'zip'.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1971
1972
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Extractor is flagged as not working; kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers as embedded in the page, best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: maps each result key to the regex that captures it.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped and URL-quoted in the page source.
                # NOTE(review): .decode("unicode_escape") implies
                # video_webpage is a byte string (Python 2) — confirm.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A broken .netrc is only a warning; proceed without login.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; continue anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page and return one info dict per format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader (mandatory)
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (mandatory)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional; warn and fall back to empty string)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date (optional; parsed as an RFC 2822 date if present)
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Best-effort: leave upload_date as None on any failure.
                    pass

        # description (optional, with placeholder default)
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if url_map:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list to formats at or below the limit;
            # _available_formats is ordered best-first.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2178
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase-alphanumeric extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch JSON metadata for a blip.tv URL (or detect a direct media URL)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-API parameters with the correct query separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL itself points at the media file; synthesize the
                # info dict from the filename and hand back the open handle.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            # Not a direct download: urlh (still open from above) holds the
            # JSON metadata response.
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or flat.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp format example: '12-31-12 11:59PM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves a direct downloadable file to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2268
2269
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV media URL and title for a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble — the attribute is named
            # _downloader; the typo raised AttributeError on invalid URLs.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link exposes the media base URL; the actual video
        # lives at <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2327
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: a verbose regex — must always be matched with re.VERBOSE,
    # which is why suitable() is overridden below.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, ascending quality order.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolutions per bitrate, used by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-video configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL and return one info dict per item."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortname forms (:tds, :colbert, ...) redirect to the show's
        # full-episodes page, then re-match against the expanded URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode":
            # the site redirects it to the concrete episode page.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect and validate that it landed on a
            # specific episode page.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Resolve the player URL through its redirect chain.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # Fetch the MRSS index feed listing every item of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like '...:<show>.com:...:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Each item has its own configuration XML listing renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, rtmp-url) pairs for each available rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2538
2539
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist."""

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report that information extraction has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video info from an escapistmagazine.com video page.

        Downloads the HTML page, pulls description/thumbnail/player URL
        from its <meta> tags, then fetches the player configuration
        (JavaScript massaged into JSON) to obtain the real video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset announced in the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # These matches used to be dereferenced unchecked, which crashed
        # with AttributeError whenever the page layout changed; fail
        # gracefully instead, like the other extractors in this file.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2613
2614
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Announce that the f4m manifest is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: moogaloop metadata XML (title, description, manifest).
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = mdoc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Second request: the f4m manifest, which names the media segment.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Compose the final fragment URL from the manifest's host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2685
2686
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being fetched."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction has begun."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(page_request).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is URL-quoted inside a flv_url= query parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: the whole matched URL is used (group 0), not a capture.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2756
2757
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a SoundCloud track URL through the public API and
        return its info dict (mp3 stream URL, title, uploader, date)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # slug of the song title (second path component)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps the human URL onto the track's API record
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The extractor contract requires upload_date in YYYYMMDD form,
        # but the API's created_at is a timestamp string (presumably like
        # '2012/09/27 13:42:13 +0000' — TODO confirm exact format).
        # Normalize when it matches; otherwise fall back to the raw value.
        upload_date = info['created_at']
        date_match = re.search(r'(\d{4})[/-](\d{2})[/-](\d{2})', upload_date or u'')
        if date_match is not None:
            upload_date = u''.join(date_match.groups())

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2830
2831
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP stream URL and metadata from an InfoQ page."""
        import base64  # local import: only this extractor needs it

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text right away: the str regexes below would raise
            # TypeError when run against raw bytes on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: jsclassref holds a base64-encoded, URL-quoted id.
        # (str.decode('base64') does not exist on Python 3; use base64.b64decode.)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(base64.b64decode(mobj.group(1)).decode('utf-8'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep the default on a miss)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2900
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for *fmt* at *bitrate*; 'best' (or an
        unavailable bitrate) selects the highest available one. Formats
        without bitrate variants map directly to their url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none responds."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # best-effort probe: a dead mirror just means "try the next"
                pass

        return None

    def _print_formats(self, formats):
        """List every format/bitrate pair (with extension) to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud cloudcast API and return the info dict for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url. These groups are already
        # text; the old .decode('utf-8') calls here crashed on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe formats until one of them has a live mirror
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3015
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (expanded into one reference per video), and the root page
    (expanded into one reference per course).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode to text: the str regexes below would raise
                # TypeError against the raw bytes on Python 3.
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Raw string: '\?' is an invalid escape in a plain literal.
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Same bytes-vs-str issue as the course page above.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3132
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract song/performer metadata and the mediaGen rendition URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode immediately: matching str patterns against raw bytes
            # raises TypeError on Python 3. ISO-8859-1 is the charset the
            # old per-group .decode() calls assumed.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # previously renditions[-1] raised IndexError on an empty list
            self._downloader.trouble(u'ERROR: unable to extract media renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3222
3223
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as several segments; _real_extract returns one
    info dict per segment ('<id>_part00', '<id>_part01', ...). Segment
    file ids come obfuscated and are decoded with a seed-driven character
    shuffle (see _get_file_ID_mix_string / _get_file_id).
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a session id: millisecond timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character permutation used to de-obfuscate file ids.

        A linear-congruential generator driven by *seed* repeatedly picks
        (and removes) one character from the source alphabet; the pick
        order defines the permutation. Returns the permutation as a list.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated fileId ('*'-separated permutation indices)
        into the real id using the seed-derived permutation."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the getPlayList JSON and emit one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Default/'best' prefers HD ('hd2') when offered, else flv.
            # NOTE(review): the 'worst' branch requests mp4 while any other
            # explicit format falls back to flv — preserved as-is.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # splice the segment number (as two hex digits) into the fileid
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3338
3339
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the info dict for the single video addressed by *url*."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the video page in one go.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Each class-level pattern yields one piece of metadata; a miss on
        # any of them aborts extraction with a specific error message.
        found = re.search(self.VIDEO_URL_RE, webpage)
        if found is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(found.group(1))

        found = re.search(self.VIDEO_TITLE_RE, webpage)
        if found is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = found.group(1)

        found = re.search(self.VIDEO_THUMB_RE, webpage)
        if found is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = found.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3402
3403
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com (Google+ posts containing a video)."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # bugfix: without this return, mobj.group(1) below raised AttributeError
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # bugfix: without this return, links[-1] below raised IndexError
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3527
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # bugfix: message claimed "video info XML" although an HTML page is fetched
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The media URL follows a fixed CDN pattern derived from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Search the downloaded page; return the unescaped first group or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # bugfix: key was misspelled 'uploader_date'; downstream expects 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3573
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (total item count, list of info dicts)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # bugfix: a bare 'return' (None) made the caller's tuple unpacking
            # raise TypeError; report an empty page instead
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped but still count towards the total
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the last one
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3652
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    IE_NAME = u'FunnyOrDie'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # bugfix: without this return, m.group('url') raised AttributeError
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # bugfix: without this return, m.group('title') raised AttributeError
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3700
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
    # bugfix: IE_NAME was missing although report_extraction reads self.IE_NAME
    IE_NAME = u'TweetReel'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # bugfix: without this return, m.group(1) raised AttributeError
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Description is best-effort: warn and fall back to an empty string
            # (bugfix: previously m.group(1) was dereferenced even when m was None)
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = u''
        else:
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            # bugfix: see above
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            # bugfix: see above
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3759
class SteamIE(InfoExtractor):
    """Information extractor for Steam store game trailer pages."""

    # bugfix: the dots in the domain were unescaped and matched any character
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    IE_NAME = u'Steam'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because the verbose pattern above requires re.VERBOSE
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_download_video_page(self, game_id):
        """Report downloading of the game's video page."""
        self._downloader.to_screen(u'[%s] %s: Downloading video page' % (self.IE_NAME, game_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        try:
            self.report_download_video_page(gameID)
            urlh = compat_urllib_request.urlopen(videourl)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
        # Titles appear in the same order as the movie entries; pair them by index.
        # NOTE(review): assumes both lists stay in lockstep — verify on real pages.
        titles = list(re.finditer(namesRE, webpage))
        videos = []
        for i, vid in enumerate(mweb):
            video_id = vid.group('videoID')
            title = titles[i].group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                # bugfix: previously an entry with an empty URL was still
                # appended after reporting the error; skip it instead
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': title
            }
            videos.append(info)
        return videos
3808
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    # bugfix: the dots in the domain were unescaped and matched any character
    _VALID_URL = r'http://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The FLV download URL follows a fixed CDN pattern based on the video id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
        # bugfix: removed a stray unreachable 'pass' that followed the return