]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
a number of new tests and fixes; all tests green on 3.3
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into a list of dictionaries
    describing the video(s) that URL refers to: the real media URL, the
    title, the uploader and so on.  That list is handed to the
    FileDownloader, which may then download the video to the file
    system, print metadata, or perform other actions.

    Mandatory fields in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these fields should be Unicode strings.

    Subclasses must redefine _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; they should normally also be added to the
    list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.

    Extractors that are known to be broken should set the _WORKING
    attribute to False so users are warned and the tests skip them.
    """

    # initialize() flips this to True after the first (lazy) initialization.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initialize the instance (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Set the downloader used by this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
        (
            (?:https?://)?                                       # http(s):// (optional)
            (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
               tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
            (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
            (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
            (?:                                                  # the various things that can precede the ID:
                (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                |(?:                                             # or the v= param in all its forms
                    (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                    (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                    (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                    v=
                )
            )?                                                   # optional -> youtube.com/xxxx is OK
        )?                                                       # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
        (?(1).+)?                                                # if we found the ID, everything can follow
        $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (used only for display in --list-formats output)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the base-class
        # implementation (which matches without re.VERBOSE) cannot be used.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur:
                dur = '4'  # default caption duration when no dur attribute is given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" % (start / (60 * 60), start / 60 % 60, start % 60, start % 1 * 1000)
            end = "%02i:%02i:%02i,%03i" % (end / (60 * 60), end / 60 % 60, end % 60, end % 1 * 1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption)  # double cycle, intentional
            srt += str(n + 1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the itag/extension/resolution table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age.  All failures are reported as warnings
        (or errors) through the downloader; nothing is raised."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info data for *url* and
        return a list of info dictionaries (one per selected format)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JSON-escaped in the page; strip the backslashes
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the various "el" values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # Date was not in this format; try the next expression.
                    # (Was a bare except, which also hid real programming errors.)
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict views are not indexable on Python 3; the old
                    # srt_lang_list.keys()[0] raised TypeError there.
                    srt_lang = list(srt_lang_list)[0]
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
            except Trouble as trouble:
                # Subtitles are best-effort; report and keep extracting
                self._downloader.trouble(str(trouble))

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = {}
            for ud in url_data:
                real_url = ud['url'][0]
                # Not every stream entry carries a separate signature; the
                # old code indexed ud['sig'] unconditionally and crashed
                # with KeyError when it was absent.
                if 'sig' in ud:
                    real_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = real_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        age-restricted videos become visible."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor via the downloader
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode here: urlopen().read() returns bytes, and the str
            # regexes below would raise TypeError on Python 3 otherwise.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages ship the media data inside the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # regex groups are already str; the old .decode('utf-8') calls
        # raised AttributeError on Python 3
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, uploader and upload date for a
        Dailymotion video page."""
        # Pull the video id out of the URL
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # The media URLs live in the JS "flashvars" assignment
        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality key that is present
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: try the regular owner span first, then the official one
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date comes as DD-MM-YYYY; emit it as YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
732
733
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a photobucket video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode here: urlopen().read() returns bytes, and matching the
            # str regexes below against bytes raises TypeError on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # regex groups are already str; the old .decode('utf-8') calls
        # raised AttributeError on Python 3
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
797
798
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # Marked as not working; excluded from the active extractor list until fixed.
    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for *url*.

        URLs that are not extractable '/watch/' pages are fetched once to
        discover the canonical ids, then this method recurses on the
        rewritten '/watch/' URL with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            # NOTE(review): urlopen().read() returns bytes on Python 3, so the
            # str-pattern re.search calls and the .decode('utf-8') calls below
            # assume Python 2 str — presumably part of why _WORKING is False.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse on the canonical /watch/ URL built from the page ids.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # NOTE(review): group(1) is the (people|profile) alternation, so this
        # captures the literal word 'people' or 'profile'; the uploader name
        # appears to be group(2) — confirm before re-enabling this extractor.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
940
941
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.  The dot after (?:www|player) is escaped
    # so only a literal subdomain separator matches (the unescaped `.` also
    # accepted e.g. 'wwwXvimeo.com').
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for *url*.

        Extraction slices the embedded config JSON out of the watch page,
        then picks the best stream (hd > sd > other) among the known codecs.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.  The page embeds it between the literal
        # markers ' = {config:' and ',assets:'.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the markers were not found in the page;
            # ValueError: json.loads could not parse the sliced text.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (left None when the page lacks the clip-date span)
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available stream in preference order; the for-else
        # fires only when no codec matched at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1054
1055
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<digits>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body; report trouble and return None on failure."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect groups into a dict.

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored under *key*.  Returns None (after
        reporting trouble) when the regex or any group fails to match.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # raises TypeError here; trouble() has already been reported in that
        # path — confirm whether callers rely on that crash behavior.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page into an info dictionary.

        Bug fix: the computed video_url used to be discarded (the method fell
        off the end without a return statement), so live URLs never produced
        a result.  It now returns an info dict shaped like the one from
        extractPlus7Stream().
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        return {
            'id': info.get('path'),
            'url': video_url,
            'uploader': u'arte.tv',
            'upload_date': None,
            # The page exposes no title for the live stream; reuse the stream
            # path so the downloader has a usable name.
            'title': info.get('path'),
            'ext': u'flv',
            'format': u'NA',
            'player_url': info.get('player'),
        }

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 catch-up page into an info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extractor and return [info]."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Previously this branch returned None, dropping the extracted
            # stream; it now yields an info dict like the +7 branch.
            info = self.extractLiveStream(url)
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1191
1192
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything, so this extractor must be tried after all others.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: reaching this extractor means no site-specific IE matched.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # A Request that issues HEAD instead of GET so the redirect
            # target can be discovered without downloading the body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        # NOTE(review): a bare OpenerDirector starts with NO default handlers,
        # so only the handlers added below are active (no proxy, cookie or
        # redirect support beyond HEADRedirectHandler).
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following: no redirect; caller continues extracting.
        if url == new_url:
            return False

        # Redirected: restart the whole extraction chain on the final URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Scrape a direct media URL out of an arbitrary webpage."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # NOTE(review): webpage is bytes on Python 3 while the patterns below
        # are str — confirm against the compat layer before relying on py3.
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1337
1338
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:terms' pseudo-URL and queue the results."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix means "first result only"; 'all' means "as many as the
        # API caps at"; anything else must parse as a positive integer.
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
                elif count > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, count))
                    self._download_n_results(query, self._max_youtube_results)
                else:
                    self._download_n_results(query, count)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        page_index = 0
        limit = n

        # The API serves 50 results per page; 'limit' shrinks to the real
        # total once the first response reports totalItems.
        while 50 * page_index < limit:
            self.report_download_page(query, page_index + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * page_index + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            collected.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_index += 1

        # Queue at most n ids, in the order the API returned them.
        for video_id in collected[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1413
1414
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch[N|all]:terms' pseudo-URL and queue the results."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix means "first result only"; 'all' means "up to the
        # hard cap"; anything else must parse as a positive integer.
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
                elif count > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                    self._download_n_results(query, self._max_google_results)
                else:
                    self._download_n_results(query, count)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _queue_downloads(self, video_ids):
        # Hand every collected docid to the downloader as a playable URL.
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers in first-seen order, stopping as soon
            # as the requested count is reached.
            for found in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = found.group(1)
                if video_id in video_ids:
                    continue
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            # No "next" link: queue whatever was found and stop paging.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1
1495
1496
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Marked as not working; excluded from the active extractor list until fixed.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query.decode() assumes a Python 2 byte string here —
        # on Python 3 the query is already str; confirm against compat layer.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:terms' pseudo-URL and queue the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix: first result only; 'all': up to the hard cap;
        # otherwise the prefix must parse as a positive integer.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; the set keeps list-append order while
            # giving O(1) duplicate checks across pages.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: queue whatever was found and stop paging.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1581
1582
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: playlist-type discriminator (p/a/list); group 2: playlist id;
    # group 3: a trailing single-video id, when present.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist (honoring playliststart/playlistend)."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated within each page only)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the page no longer offers a "Next »" link.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # playliststart/playlistend are 1-indexed and inclusive; the -1 turns
        # playliststart into a 0-based slice start, and the exclusive slice
        # end makes playlistend inclusive as documented.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1661
1662
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video found on the channel's paged video listing."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        # Walk the listing page by page until the "Next »" link disappears.
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect video identifiers, deduplicated within this page only.
            page_ids = []
            for found in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = found.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1713
1714
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username out of the URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps every response at _GDATA_PAGE_SIZE entries,
        # so request consecutive pages until one comes back short --
        # that page is necessarily the last one.
        video_ids = []
        page_index = 0
        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            feed_request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
            try:
                feed_page = compat_urllib_request.urlopen(feed_request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, dropping duplicates
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, feed_page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the final one, so no
            # further queries are necessary.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_index += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start / --playlist-end window
        first_index = self._downloader.params.get('playliststart', 1) - 1
        last_index = self._downloader.params.get('playlistend', -1)
        if last_index == -1:
            video_ids = video_ids[first_index:]
        else:
            video_ids = video_ids[first_index:last_index]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1796
1797
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id lives in a data attribute of the profile page;
        # without it the AJAX episode-list endpoint cannot be queried.
        # Guard explicitly: previously a missing attribute raised an
        # uncaught AttributeError instead of a proper error message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str (not str) for consistency with the rest of the
                # file; str() can fail on unicode messages under Python 2
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start / --playlist-end window
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1888
1889
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode at the boundary so the regular expressions below run
            # on text rather than bytes (required on Python 3; previously
            # the page was kept as bytes and later .decode() calls were
            # applied to already-text match results)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        # All values are already text now that the page is decoded up front
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1952
1953
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): extractor is disabled (_WORKING = False); much of the
    # code below still uses Python 2 idioms such as .decode() on values
    # that are already text on Python 3 -- confirm before re-enabling.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Known format identifiers, ordered best quality first
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each key maps to the regex that locates its value
        # in the page's inline JavaScript
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # values are percent-escaped inside a JS unicode-escape layer
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied."""
        # Without a downloader there are no params to read
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials -> proceed anonymously
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means the credentials were rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional; warn but continue without it)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz returns a 10-tuple; strftime takes the
                    # leading 9 fields.  The bare except keeps extraction
                    # best-effort if formatting fails.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # --max-quality trims the candidate list from the best end
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never bound
        # and the loop below raises NameError -- confirm intended behavior
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2159
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for the JSON description of the video
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                if isinstance(title, bytes):
                    # Python 2 yields a byte string here and needs decoding;
                    # Python 3 str has no .decode, so only decode when the
                    # value is actually bytes (previously an unconditional
                    # .decode raised AttributeError on Python 3)
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses are either wrapped in a 'Post' envelope or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the container extension from the media URL
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves some downloads only to the iTunes user agent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2249
2250
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # fixed: was self._download.trouble, which raised an
            # AttributeError instead of reporting the invalid URL
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the base URL of the movie files;
        # the flv lives next to it under the video id
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2308
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                      (?P<clip>
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, worst first (turls below ends up in this order)
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media configuration document."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's episode index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        # Used by --list-formats: bitrate, extension and dimensions
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms like ":tds" map to the show's full-episodes page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode -> the server will redirect us to the
            # newest one; detect that case and re-parse after download
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # We were redirected to the concrete newest episode; take the
            # episode title from the final URL
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the final SWF player URL
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # The MRSS index lists one <item> per media segment
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Each <rendition> advertises one (bitrate, url) pair
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2519
2520
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in the response header,
            # falling back to utf-8
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Each of these page elements is required; fail with a clear error
        # message instead of an AttributeError when one is missing.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config document location in its query
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2594
2595
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report the download of the Adobe f4m manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # findall()[0] raises IndexError, attrib['url'] raises KeyError;
            # either way the manifest is malformed, so report rather than crash.
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2666
2667
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report that the video page is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Re-fetch through the canonical URL built from the extracted id.
        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(request).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is percent-encoded inside a flashvars-style parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # For the thumbnail the whole matched URL is used (group 0),
        # not the trailing filename capture.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2737
2738
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report the resolve.json API lookup."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report the stream-URL retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The info-dict contract requires upload_date in YYYYMMDD form, but the
        # API reports created_at like "2012/05/02 19:35:57 +0000" — convert it
        # when the string matches that shape, otherwise leave the date unset.
        upload_date = None
        mobj = re.search(r'^(\d{4})/(\d{2})/(\d{2})', compat_str(info['created_at']))
        if mobj is not None:
            upload_date = ''.join(mobj.groups())

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2811
2812
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Local import: base64 is only needed by this extractor.
        import base64

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text so the str regexes below also work on Python 3,
            # where read() returns bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL; the page stores it base64-encoded.
        # (str.decode('base64') does not exist on Python 3 — use the base64
        # module, which works on both interpreters.)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description; fall back to a placeholder when missing.
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2881
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (match groups are already text — the Python-2-only .decode('utf-8')
        # calls would raise AttributeError on Python 3 and were removed)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first; json.loads needs text)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2996
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video (course+video params),
        a course page (course param only), or the site root.  The latter two
        build playlists by recursing through self.extract() on each link."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Derive the extension from the file name in the video URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # NOTE(review): coursepage is raw bytes here while the patterns
            # below are str — this relies on Python 2 semantics; verify that
            # it behaves on Python 3 (str patterns cannot match bytes).
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the VideoPage links in document order, dropping duplicates.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse on every referenced video page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every CoursePage reachable from the root becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse on every course page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3113
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text up front (the original decoded each match group as
            # iso-8859-1); on Python 3 read() returns bytes and the str regexes
            # below would not match otherwise.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message previously read 'unable to mtvn_uri' — a verb was missing.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3203
3204
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as numbered segments; the per-segment file ids are
    reconstructed from a server-supplied 'seed' via the character-shuffling
    scheme implemented in _get_file_ID_mix_string/_get_file_id below.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # numbers, mirroring what the site's own player generates.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet with a linear congruential
        # generator seeded by the server-provided 'seed'; each step picks and
        # removes one character, so the result is a permutation of 'source'.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the shuffled
        # alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # Map the requested format onto Youku stream names: best quality is
            # 'hd2' when available, otherwise 'flv'; 'worst' maps to 'mp4';
            # anything else falls back to 'flv'.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3319
3320
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page in one step.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Pull the three fields out of the page; each has its own pattern and
        # its own error message so failures are easy to pinpoint.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3383
3384
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # The return was missing here: falling through would crash with
            # AttributeError on mobj.group(1) below.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # The return was missing here: an empty list would crash with
            # IndexError on links[-1] below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3508
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the info dictionary for an nba.com video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (HTML-unescaped) capture group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: the key was misspelled 'uploader_date'; the downloader
            # expects the standard 'upload_date' field (see the contract in
            # the InfoExtractor class docs at the top of this file).
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3554
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page; return (total item count, info dicts)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: was a bare `return` (None); the caller unpacks the
            # result into two values, so return an empty page instead of
            # crashing with a TypeError.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Skip entries without a downloadable file.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with 'YYYY-MM-DD'; strip the dashes
                # to get the YYYYMMDD format the downloader expects.
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract either a single clip or a whole channel archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing must be paged through.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or unpaged) page means there is nothing more to fetch.
            if not paged or page_count != limit:
                break
            offset += limit
        return info