# jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
# various py3 fixes; all tests green on 3.3
# [yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into information about the
    video(s) it refers to: the real video URL, the title, the uploader
    and so on.  That information is packed into dictionaries which are
    handed to the FileDownloader, which in turn may download the video
    to the file system, among other possible outcomes.

    Each dictionary must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define _real_initialize() and _real_extract()
    and define a _VALID_URL regexp; they should probably also be added
    to the list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.

    Finally, set the _WORKING attribute to False for broken IEs so that
    users are warned and the tests are skipped.
    """

    # Class-level defaults; _ready and _downloader are shadowed per instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor and attach the optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when this IE can handle the given URL."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def working(self):
        """Return whether this IE is known to work (_WORKING)."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    _real_initialize() sets the interface language and optionally logs in
    and confirms age; _real_extract() downloads the watch page and the
    get_video_info endpoint, then builds one info dict per selected format.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # itags listed in order of quality, best first
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same, but free (WebM) formats preferred within each quality tier.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything unknown falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" string, only used for display purposes.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains embedded comments, so re.VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT subtitle text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            start = float(start)
            end = start + float(dur)
            # %i truncates the float components to whole units.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language and, when credentials are available, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (best effort: a failure only emits a warning)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and format URLs; return a list of info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL (group 2 is the bare ID, see _VALID_URL)
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (needed by rtmpdump)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JS escaping (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the el= variants in turn until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try the known date layouts
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    break
                except ValueError:
                    # strptime only raises ValueError; don't swallow anything else.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions (best effort: failures are reported as warnings)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict views are not subscriptable on py3; materialize first.
                    srt_lang = list(srt_lang_list)[0]
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
            except Trouble as trouble:
                self._downloader.trouble(str(trouble))

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Build itag -> URL; 'sig' is not guaranteed, so only append it when present.
            url_map = {}
            for ud in url_data:
                real_url = ud['url'][0]
                if 'sig' in ud:
                    real_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = real_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information.
        # Decode the response here: on Python 3, urlopen().read() returns
        # bytes, which would not match the str regexes below.
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages embed the media data in the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Match groups are already text strings; no .decode() needed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 captures the ID slug after /video/ on any dailymotion TLD.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Download the video page and extract URL, title, uploader and date."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip a trailing "_title" slug and any query string from the ID.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # Decode explicitly: read() returns bytes on Python 3.
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe the quality keys from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            # for/else: loop completed without a break -> no quality key found.
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Undo JSON escaping of slashes in the media URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD for the info dict.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
732
733
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and title from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information.
        # Decode the response here: on Python 3, urlopen().read() returns
        # bytes, which would not match the str regexes below.
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct download URL: fall back to the flv stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Undo the JS hex escaping of '=' and '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
825
826
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Matches URLs whose ``current`` query parameter points at a .flv
    file and extracts media URL, title and uploader from the page.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # urlopen().read() returns bytes; decode once so the text
            # regexes below work on Python 3 as well as Python 2.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        # NOTE: the previous .decode('utf-8') calls on these values were
        # Python 2 leftovers; on Python 3 they raised AttributeError
        # because str has no decode() (e.g. video_extension is a str).
        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
890
891
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract info from a Yahoo! Video URL.

        URLs that are not in the canonical /watch/ form are rewritten to
        it and re-extracted; new_video=False marks that single level of
        recursion.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode bytes once so text regexes work on Python 3.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # The rewritten URL matches _VPAGE_URL, so this recurses at
            # most once.
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the (people|profile) alternation; the
        # uploader name is captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist API)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        # unquote already returns text; the old .decode('utf-8') broke
        # on Python 3.
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1032
1033
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info from the page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player markup.
        # split() raises IndexError when the marker is missing and
        # json.loads() raises ValueError on malformed JSON; the previous
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; stays None when not found)
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; the for/else reports failure
        # only when no quality bucket had any entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1146
1147
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Follows a chain of pages: the video page references a JS/XML
    resource, which in turn points at the actual media URL. The
    chaining is done with grep_webpage(), which fetches a URL and
    extracts named regex groups into a dict.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<digits>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting an error."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex against it.

        matchTuples is a list of (group_index, key, error_message); on a
        successful match the dict {key: group} is returned, otherwise
        None after reporting the given error.

        NOTE(review): fetch_webpage() returns None on download failure,
        and re.search on None would raise TypeError here — confirm
        whether that path is reachable in practice.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp live-stream URL for the given page.

        NOTE(review): video_url is computed but never returned or
        downloaded, and _real_extract() discards this method's result —
        live streams currently produce no output. Looks unfinished;
        confirm intent before relying on it.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract an Arte+7 (catch-up) stream: page -> videoref XML -> media URL."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the per-video XML with id, title, date and the
        # HD-quality media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages are handled separately; see extractLiveStream's
        # NOTE about its discarded result.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1283
1284
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects, then looks for a JW-Player-style
    file= parameter in the page to find the media URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        # The URL redirected somewhere else; restart extraction there.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode bytes once so the text regexes below work on
            # Python 3 as well as Python 2.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: previously reported 'unable to extract title'
            # (copy-paste error) for this failure mode.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1429
1430
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): assumes query is bytes (Python 2 path); on
        # Python 3, str has no decode() — confirm the caller's type.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' — queries containing a
        # colon (e.g. "ytsearch5:foo: bar") raised ValueError before.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode before json.loads: on Python < 3.6 json.loads
                # does not accept bytes.
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; shrink the limit so we
            # stop paging as soon as everything available is collected.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1505
1506
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): assumes query is bytes (Python 2 path); on
        # Python 3, str has no decode() — confirm the caller's type.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' — queries containing a
        # colon raised ValueError before.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode bytes once so the text regexes below work on
                # Python 3 as well as Python 2.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Fewer than n results exist; download what we found.
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1588
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): assumes query is bytes (Python 2 path); on
        # Python 3, str has no decode() — confirm the caller's type.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split only on the first ':' — queries containing a
        # colon raised ValueError before.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode bytes once so the text regexes below work on
                # Python 3 as well as Python 2.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Fewer than n results exist; download what we found.
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1671
1672
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Resolve a playlist URL into individual video downloads."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # URL that also names a concrete video: hand it straight back to the
        # downloader, which will dispatch it to the video extractor.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # 'p' is the default prefix for playlists, but artist pages ('a')
        # use a different endpoint.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 1

        # Fetch listing pages until the "next page" marker disappears.
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids on this page, dropping duplicates but keeping order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user's playliststart/playlistend window
        # (1-based start; end of -1 means "to the end").
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1751
1752
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id listed on a channel and queue each for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Walk the paginated channel listing until the "next page" marker
        # is no longer present.
        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids on this page, dropping duplicates but keeping order.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1803
1804
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue every upload of a user, honouring playliststart/playlistend."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The YouTube Data API caps each response (currently at 50 entries),
        # so request consecutive windows until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate within the page while preserving first-seen order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one: there are no
            # more ids on further pages, so no need to query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's playliststart/playlistend window
        # (1-based start; end of -1 means "to the end").
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1886
1887
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue every video of a blip.tv user for download.

        Fetches the profile page to obtain the numeric user id, then pages
        through the AJAX episode list (at most _PAGE_SIZE entries per query)
        and hands each video URL to the downloader, honouring the
        playliststart/playlistend options. Returns None; errors are reported
        through the downloader.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id embedded in the profile page is required to
        # query the episode-list endpoint. Bug fix: a missing match used to
        # raise an uncaught AttributeError on mobj.group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Consistency fix: use compat_str like every other extractor
                # (plain str mangles non-ASCII messages on Python 2).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Bug fix: compare the unescaped id,
            # not the raw href, so HTML-escaped duplicates are deduplicated.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1978
1979
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # Py3 fix: POST data must be bytes, and the page is decoded to text
        # so the str regexes below work on both Python 2 and 3.
        free_download_indication = { 'gateway_result' : '1' }
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Py3 fix: regex groups are already text, so the former
        # .decode('utf-8') calls (valid only for Python 2 byte strings)
        # are gone.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2042
2043
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Extractor is disabled: the login flow and page regexes below target an
    # old Facebook layout, and several .decode() calls assume Python 2 byte
    # strings (str has no .decode on Python 3) — NOTE(review): confirm before
    # re-enabling.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers as they appear in the page, best quality first;
    # the order drives format selection in _real_extract.
    _available_formats = ['video', 'highqual', 'lowqual']
    # Mapping from format identifier to container extension.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data
        # Each entry maps an info key to the regex capturing its value in the
        # page's inline JavaScript.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") presumes the match is
                # a Python 2 byte string — fails on Python 3 str; confirm.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data.

        Silently does nothing when no downloader or no credentials are
        available; login failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video info dictionaries for a Facebook video URL.

        Returns a list with one entry per selected format, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            # Missing thumbnail is only a warning; extraction continues.
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            # NOTE(review): _parse_page never sets 'upload_date', so this
            # branch appears dead as written — confirm.
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Best-effort date parsing; a malformed date is ignored.
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list to formats at or below the limit.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2249
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Matches the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract a blip.tv video via its JSON API (or a direct media URL).

        Two paths: if the URL already serves a video Content-Type, the URL
        itself is the download (with the open handle passed along as
        'urlhandle'); otherwise the page's JSON metadata is fetched and
        parsed. Returns a one-element list of info dicts, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-API query to the page URL, reusing its query string
        # separator if one is already present.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # NOTE(review): title.decode('UTF-8') assumes a Python 2 byte
                # string; on Python 3 basename is str — confirm.
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # response is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; reuse it to read the
                # JSON body.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' envelope or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert blip.tv's "MM-DD-YY HH:MMam/pm" stamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Side effect: mutates the module-global std_headers so the actual
        # media download presents an iTunes User-Agent (blip.tv serves
        # different content otherwise — presumably; verify).
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2339
2340
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the direct flv URL and title for a myvideo.de watch page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously called self._download.trouble — a
            # non-existent attribute — so invalid URLs raised AttributeError
            # instead of being reported.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Py3 fix: decode to text so the str regexes below work on both
            # Python 2 and 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        # The flv lives next to the thumbnail directory on the same host.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2398
2399 class ComedyCentralIE(InfoExtractor):
2400 """Information extractor for The Daily Show and Colbert Report """
2401
2402 # urls can be abbreviations like :thedailyshow or :colbert
2403 # urls for episodes like:
2404 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2405 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2406 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2407 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2408 |(https?://)?(www\.)?
2409 (?P<showname>thedailyshow|colbertnation)\.com/
2410 (full-episodes/(?P<episode>.*)|
2411 (?P<clip>
2412 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2413 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2414 $"""
2415 IE_NAME = u'comedycentral'
2416
2417 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2418
2419 _video_extensions = {
2420 '3500': 'mp4',
2421 '2200': 'mp4',
2422 '1700': 'mp4',
2423 '1200': 'mp4',
2424 '750': 'mp4',
2425 '400': 'mp4',
2426 }
2427 _video_dimensions = {
2428 '3500': '1280x720',
2429 '2200': '960x540',
2430 '1700': '768x432',
2431 '1200': '640x360',
2432 '750': '512x288',
2433 '400': '384x216',
2434 }
2435
2436 def suitable(self, url):
2437 """Receives a URL and returns True if suitable for this IE."""
2438 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2439
2440 def report_extraction(self, episode_id):
2441 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2442
2443 def report_config_download(self, episode_id):
2444 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2445
2446 def report_index_download(self, episode_id):
2447 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2448
2449 def report_player_url(self, episode_id):
2450 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2451
2452
2453 def _print_formats(self, formats):
2454 print('Available formats:')
2455 for x in formats:
2456 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2457
2458
2459 def _real_extract(self, url):
2460 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2461 if mobj is None:
2462 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2463 return
2464
2465 if mobj.group('shortname'):
2466 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2467 url = u'http://www.thedailyshow.com/full-episodes/'
2468 else:
2469 url = u'http://www.colbertnation.com/full-episodes/'
2470 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2471 assert mobj is not None
2472
2473 if mobj.group('clip'):
2474 if mobj.group('showname') == 'thedailyshow':
2475 epTitle = mobj.group('tdstitle')
2476 else:
2477 epTitle = mobj.group('cntitle')
2478 dlNewest = False
2479 else:
2480 dlNewest = not mobj.group('episode')
2481 if dlNewest:
2482 epTitle = mobj.group('showname')
2483 else:
2484 epTitle = mobj.group('episode')
2485
2486 req = compat_urllib_request.Request(url)
2487 self.report_extraction(epTitle)
2488 try:
2489 htmlHandle = compat_urllib_request.urlopen(req)
2490 html = htmlHandle.read()
2491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2492 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2493 return
2494 if dlNewest:
2495 url = htmlHandle.geturl()
2496 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2497 if mobj is None:
2498 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2499 return
2500 if mobj.group('episode') == '':
2501 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2502 return
2503 epTitle = mobj.group('episode')
2504
2505 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2506
2507 if len(mMovieParams) == 0:
2508 # The Colbert Report embeds the information in a without
2509 # a URL prefix; so extract the alternate reference
2510 # and then add the URL prefix manually.
2511
2512 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2513 if len(altMovieParams) == 0:
2514 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2515 return
2516 else:
2517 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2518
2519 playerUrl_raw = mMovieParams[0][0]
2520 self.report_player_url(epTitle)
2521 try:
2522 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2523 playerUrl = urlHandle.geturl()
2524 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2525 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2526 return
2527
2528 uri = mMovieParams[0][1]
2529 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2530 self.report_index_download(epTitle)
2531 try:
2532 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2533 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2534 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2535 return
2536
2537 results = []
2538
2539 idoc = xml.etree.ElementTree.fromstring(indexXml)
2540 itemEls = idoc.findall('.//item')
2541 for itemEl in itemEls:
2542 mediaId = itemEl.findall('./guid')[0].text
2543 shortMediaId = mediaId.split(':')[-1]
2544 showId = mediaId.split(':')[-2].replace('.com', '')
2545 officialTitle = itemEl.findall('./title')[0].text
2546 officialDate = itemEl.findall('./pubDate')[0].text
2547
2548 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2549 compat_urllib_parse.urlencode({'uri': mediaId}))
2550 configReq = compat_urllib_request.Request(configUrl)
2551 self.report_config_download(epTitle)
2552 try:
2553 configXml = compat_urllib_request.urlopen(configReq).read()
2554 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2555 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2556 return
2557
2558 cdoc = xml.etree.ElementTree.fromstring(configXml)
2559 turls = []
2560 for rendition in cdoc.findall('.//rendition'):
2561 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2562 turls.append(finfo)
2563
2564 if len(turls) == 0:
2565 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2566 continue
2567
2568 if self._downloader.params.get('listformats', None):
2569 self._print_formats([i[0] for i in turls])
2570 return
2571
2572 # For now, just pick the highest bitrate
2573 format,video_url = turls[-1]
2574
2575 # Get the format arg from the arg stream
2576 req_format = self._downloader.params.get('format', None)
2577
2578 # Select format if we can find one
2579 for f,v in turls:
2580 if f == req_format:
2581 format, video_url = f, v
2582 break
2583
2584 # Patch to download from alternative CDN, which does not
2585 # break on current RTMPDump builds
2586 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2587 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2588
2589 if video_url.startswith(broken_cdn):
2590 video_url = video_url.replace(broken_cdn, better_cdn)
2591
2592 effTitle = showId + u'-' + epTitle
2593 info = {
2594 'id': shortMediaId,
2595 'url': video_url,
2596 'uploader': showId,
2597 'upload_date': officialDate,
2598 'title': effTitle,
2599 'ext': 'mp4',
2600 'format': format,
2601 'thumbnail': None,
2602 'description': officialTitle,
2603 'player_url': None #playerUrl
2604 }
2605
2606 results.append(info)
2607
2608 return results
2609
2610
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce that information extraction has started for a show."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Fetch the episode page, follow it to the player config and
        return a single-entry list with the video information."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        episode_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honour the charset advertised in the Content-Type header,
            # falling back to UTF-8 when none is given.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Everything interesting lives in <meta> tags on the episode page.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_js = compat_urllib_request.urlopen(config_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: swap the quote style
        # so json.loads accepts it.
        config_js = config_js.replace("'", '"')

        try:
            config = json.loads(config_js)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': episode_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2682
2683
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest, and
        assemble the download URL from both."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {'id': video_id, 'uploader': None, 'upload_date': None}

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = mdoc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            # The manifest is in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2754
2755
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flash URL, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The FLV URL is URL-quoted inside the page.
        m = re.search(r'flv_url=(.+?)&', page)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # The page <title> carries the video title with a site suffix.
        m = re.search(r'<title>(.*?)\s+-\s+XVID', page)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Whole match (group 0) is the full thumbnail URL.
        m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', page)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = m.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2825
2826
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the permalink through the SoundCloud API, then look up
        the track's stream URLs and return the MP3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader and the song slug are encoded in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2899
2900
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP URL, title and description from an InfoQ page.

        Fixes for Python 3: the page is decoded to str before running str
        regexes on it, the base64 step uses base64.b64decode instead of the
        removed str.decode('base64'), and the spurious .decode('utf-8')
        calls on str match groups are dropped.
        """
        import base64  # local import: the module import block is outside this section

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to str so the regexes below also work on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL; jsclassref holds a base64-encoded, URL-quoted path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; a default is used when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2969
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry carries per-bitrate sub-entries, pick the
        requested (or highest) bitrate; otherwise return the bare url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None when none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead mirror; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API for the cloudcast and pick a working URL.

        Python 3 fixes: the API response bytes are decoded before
        json.loads, and the bogus .decode('utf-8') calls on objects that
        are already str have been removed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first for Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each advertised format until one of its URLs responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3082
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL type: single video, course page, or root page.

        Course and root pages behave as playlists whose referenced pages
        are extracted recursively via self.extract().

        Python 3 fix: the downloaded HTML pages are decoded to str before
        being searched with str regex patterns.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode to str so the str regexes below work on Python 3.
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode to str so the str regexes below work on Python 3.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3199
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the mediaGen URL from the page, fetch its XML and pick
        the highest-quality rendition.

        Python 3 fix: the page bytes are decoded before running str
        regexes on them. iso-8859-1 maps every byte, so the regex
        behavior and the previously group-level iso-8859-1 decoding of
        song name and performer are preserved exactly.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # was "unable to mtvn_uri" -- fixed wording
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3289
3290
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple segments; the file id from the
    playlist JSON is scrambled and must be descrambled with a per-video
    seed before the per-segment download URLs can be built.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # integers, concatenated as a decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle a fixed alphabet using a linear
        # congruential sequence keyed by `seed`; each step picks (and
        # removes) one character from the remaining pool.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The scrambled id is a '*'-separated list of indexes into the
        # seed-shuffled alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested --format onto Youku's stream names:
            # 'best' prefers hd2 when available, 'worst' takes mp4, and
            # anything else falls back to flv.
            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # One info dict per segment; segments share the title and
            # differ only in their id suffix and URL.
            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3405
3406
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flash URL, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The URL is URL-quoted in the page source.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3469
3470
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        # url is already text here; the old .decode('utf-8') broke on Python 3
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video behind a Google+ post.

        Returns a one-element list with the standard info dictionary, or
        None (after signalling trouble) on failure.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            # Decode to text immediately: on Python 3 matching str patterns
            # against the raw bytes would raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out: previously execution fell through and crashed on
            # mobj.group(1) below.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out: previously execution fell through and crashed on
            # links[-1] below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026-style sequences embedded in the JS literal.
        # (The old py2-only unicode(video_url, "unicode_escape") raised
        # NameError on Python 3.)
        video_url = video_url.encode('ascii').decode('unicode_escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3591
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the MP4 URL and metadata for an nba.com video page.

        Returns a one-element list with the standard info dictionary, or
        None (after signalling trouble) on failure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived from the page path; no player API call needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First-group regex search over the page, HTML-unescaped,
            # falling back to *default* when the pattern is absent.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed typo: was 'uploader_date', which no consumer recognizes;
            # the standard optional field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3637
3638 class JustinTVIE(InfoExtractor):
3639 """Information extractor for justin.tv and twitch.tv"""
3640 # TODO: One broadcast may be split into multiple videos. The key
3641 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3642 # starts at 1 and increases. Can we treat all parts as one video?
3643
3644 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3645 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3646 _JUSTIN_PAGE_LIMIT = 100
3647 IE_NAME = u'justin.tv'
3648
3649 def report_extraction(self, file_id):
3650 """Report information extraction."""
3651 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3652
3653 def report_download_page(self, channel, offset):
3654 """Report attempt to download a single page of videos."""
3655 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3656 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3657
3658 # Return count of items, list of *valid* items
3659 def _parse_page(self, url):
3660 try:
3661 urlh = compat_urllib_request.urlopen(url)
3662 webpage_bytes = urlh.read()
3663 webpage = webpage_bytes.decode('utf-8', 'ignore')
3664 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3665 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3666 return
3667
3668 response = json.loads(webpage)
3669 info = []
3670 for clip in response:
3671 video_url = clip['video_file_url']
3672 if video_url:
3673 video_extension = os.path.splitext(video_url)[1][1:]
3674 video_date = re.sub('-', '', clip['created_on'][:10])
3675 info.append({
3676 'id': clip['id'],
3677 'url': video_url,
3678 'title': clip['title'],
3679 'uploader': clip.get('user_id', clip.get('channel_id')),
3680 'upload_date': video_date,
3681 'ext': video_extension,
3682 })
3683 return (len(response), info)
3684
3685 def _real_extract(self, url):
3686 mobj = re.match(self._VALID_URL, url)
3687 if mobj is None:
3688 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3689 return
3690
3691 api = 'http://api.justin.tv'
3692 video_id = mobj.group(mobj.lastindex)
3693 paged = False
3694 if mobj.lastindex == 1:
3695 paged = True
3696 api += '/channel/archives/%s.json'
3697 else:
3698 api += '/clip/show/%s.json'
3699 api = api % (video_id,)
3700
3701 self.report_extraction(video_id)
3702
3703 info = []
3704 offset = 0
3705 limit = self._JUSTIN_PAGE_LIMIT
3706 while True:
3707 if paged:
3708 self.report_download_page(video_id, offset)
3709 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3710 page_count, page_info = self._parse_page(page_url)
3711 info.extend(page_info)
3712 if not paged or page_count != limit:
3713 break
3714 offset += limit
3715 return info