]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
Better error reporting for SoundCloud IE
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    Given a URL, an information extractor produces a *list* of dictionaries
    describing the video (or videos) the URL refers to: the real media URL,
    the title, the uploader, and so on.  Each dictionary is handed to the
    FileDownloader, which may then download the media to disk, print the
    information, or perform any other configured action.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should override _real_initialize() and _real_extract() and
    define a _VALID_URL regular expression; they should usually also be added
    to the list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.  A broken extractor should
    set _WORKING to False so users get a warning and its tests are skipped.
    """

    # Class-level defaults; __init__ re-initializes per instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this extractor is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dictionaries."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matches watch pages, embeds, youtu.be links, and bare
    # video IDs, while rejecting playlist/artist URLs.  Group 2 is the ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string (used by --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT subtitle text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when no dur attribute is present
            start = float(start)
            end = start + float(dur)
            # SRT timestamp format: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the given itags with their extension and dimensions (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in (credentials or .netrc), and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so later regexes match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is still present in the response, the login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info, then build one info dict per requested format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (needed by rtmpdump); optional.
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-style backslash escapes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try a few date layouts.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except; if no expression matches,
                    # upload_date stays the raw scraped string.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions; failures are reported via Trouble and do not abort extraction
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                # Map lang_code -> track name from the caption track listing.
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # NOTE(review): dict.keys() is not indexable on Python 3;
                    # this fallback only works on Python 2.
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # NOTE(review): indexing an exception (trouble[0]) is a
                # Python-2-only idiom; it raises TypeError on Python 3.
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a URL-encoded query string.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): 'sig' is not checked by the filter above; an entry
            # without it would raise KeyError here — confirm upstream always sends it.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to confirm age."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, uploader and title from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: media URL directly in the page source.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available (appended as an access token)
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media data lives in the flashvars query string.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls here and below assume byte
        # strings (Python 2); on Python 3 str has no .decode — confirm target.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best available stream URL, title and uploader from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Drop the title slug ('_...') and any query string from the captured group.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')  # disable the family filter
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the first (highest) quality present; the for/else 'else' runs
        # only when the loop finishes without a break (no key found).
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # NOTE(review): only a warning here, but video_uploader stays
                # None and the .decode() in the return below would then raise.
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
731
732
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and title from a Google Video play page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct download URL; fall back to the flv stream URL,
            # which is embedded with \xNN hex escapes.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')  # '='
        mediaURL = mediaURL.replace('\\x26', '\x26')  # '&'

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        # NOTE(review): the description (and the thumbnail below) are
        # extracted but never included in the returned dictionary.
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
824
825
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a Photobucket page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # The video id is the .flv filename captured from the query string.
        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the page; media URL and metadata are embedded in the HTML.
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The direct media URL is carried by the video_src <link> element.
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader are both encoded in the <title> element.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
889
890
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-'/watch/' URLs are rewritten to their canonical '/watch/' form
        and re-extracted once (new_video=False guards the recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUG FIX: group(1) is the (people|profile) path alternation; the
        # uploader name is captured by group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1031
1032
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and a signed play_redirect URL from a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # BUG FIX: the string slicing used to happen outside the try, so an
        # IndexError on pages without a config blob escaped uncaught; the
        # bare `except:` has also been narrowed to the exceptions the two
        # statements can actually raise.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; absent on some pages)
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; for-else fires when nothing matched.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1144
1145
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download url and return its body; on failure the error is reported
        # through the downloader and None is returned implicitly.
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch url, apply regex (with regexFlags) and map the numbered groups
        # listed in matchTuples [(group_index, key, error_message), ...] into
        # a dict keyed by `key`.
        # NOTE(review): returns None when the page does not match or a group
        # is missing; callers immediately call info.get(...) without a None
        # check, so a failed match raises AttributeError downstream.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Chase the videothek JS to locate the rtmp live-stream URL.
        # NOTE(review): video_url is computed but never returned or stored,
        # so live streams are effectively not extracted (see _real_extract,
        # which discards this call's result).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Follow the chain of XML references for an Arte+7 catch-up video:
        # player param -> per-language <video> ref -> final metadata document.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        # Live pages (index-N.html) and Arte+7 pages are handled differently;
        # only the +7 path yields a result (see extractLiveStream note above).
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1281
1282
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only the headers are fetched.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the normal extraction continue.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: look for a direct media URL in the page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUG FIX: this failure used to be misreported as
            # 'unable to extract title'.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
1427
1428
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:terms' query and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing a
        # colon (e.g. "ytsearch5:foo: bar") no longer raise ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; stop once we have enough or the
        # API reports fewer total items than requested.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1503
1504
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch[N|all]:terms' query and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing a
        # colon no longer raise ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue everything collected so far.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1585
1586
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:terms' query and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUG FIX: split only on the first ':' so search terms containing a
        # colon no longer raise ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: queue everything collected so far.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1669
1670
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Walk every page of a playlist and queue each video for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        # (group 3 matches a trailing /.../<video-id> segment in the URL)
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        # Keep fetching pages until the "next page" pager control disappears.
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            # (deduplicated per page only; ids repeated across pages may recur)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Honour --playlist-start / --playlist-end (1-based options; -1 end
        # means "until the last video").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1742
1743
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Pull the channel id out of the URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        channel_id = match.group(1)

        video_ids = []
        pagenum = 1

        # Page through the channel's video listing until no 'Next' button
        # is present in the markup.
        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather ids on this page, keeping only the first occurrence.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1792
1793
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username out of the URL (or the ytuser: shorthand).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        username = mobj.group(1)

        # The GData API caps results per request at _GDATA_PAGE_SIZE, so
        # request consecutive windows until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids in this window, de-duplicated per page.
            ids_in_page = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page shorter than the window size must be the last one, so
            # there is no need to query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1875
1876
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a blip.tv user and queue each for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id feeds the episode-list AJAX endpoint. This was
        # previously dereferenced without a None check inside a try block
        # that did not catch AttributeError, crashing on unexpected markup.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id')
            return
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str (not str) matches the rest of the file and is
                # safe for non-ASCII error messages on Python 2.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. De-duplicate on the unescaped value,
            # since that is what actually gets stored (the old code compared
            # the raw match against unescaped entries, so duplicates with
            # HTML entities slipped through).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1967
1968
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # The last path component is the file id; rebuild the URL with the
        # English locale so the page markup is predictable.
        file_id = url.split('/')[-1]
        url = 'http://depositfiles.com/en/files/' + file_id

        # POSTing gateway_result=1 simulates pressing the 'Free download'
        # button on the landing page.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Locate the real file URL inside the download form.
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # No download link; surface the site's own explanation if any.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None and mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # The file title lives in a tooltip attribute.
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
2031
2032
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Marked broken: the page-scraping patterns below are presumed stale.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-to-worst quality; the order drives format selection below.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each key maps to the regex that locates its value
        # in the page's inline Javascript.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") implies Python 2
                # byte strings here — this path breaks on Python 3 str input.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one entry per available format found on the page.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log into Facebook using --username/--password or .netrc, if given."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Missing/unparseable .netrc is only a warning, not fatal.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials from either source: proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being present in the response means the login
            # attempt was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page and return a list of info dictionaries,
        one per format selected by the --format / --format-limit options."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader (mandatory: abort if the page did not yield one)
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (mandatory)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional: warn and fall back to empty string)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date: parse an RFC 2822 date into YYYYMMDD, best-effort.
        # NOTE(review): _parse_page never sets 'upload_date', so this branch
        # looks dead — confirm before relying on it.
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description (optional, with placeholder default)
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # --format-limit caps quality at the given format; the list is
            # ordered best-first, so slice from the limit onwards.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): video_url_list is only assigned inside the branch
        # above — if url_map were empty this would raise NameError. Confirm
        # whether an empty url_map can actually reach this point.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
2238
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv metadata via the site's JSON skin and return a
        one-element list with the info dictionary.

        Two paths: if the JSON URL unexpectedly serves the media itself
        (Content-Type video/*), treat it as a direct download; otherwise
        parse the JSON payload for the real media URL and metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query, using '&' when the URL already has
        # a query string and '?' otherwise.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        # info stays None unless the direct-download path below fills it in.
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # Derive id/title/extension from the URL's basename.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open handle to the downloader so the body
                    # is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; reuse it to read
                # the JSON body.
                json_code = urlh.read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # blip.tv dates look like '08-15-12 02:46PM'; normalize to
                # the YYYYMMDD form used throughout the info dict.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves different (direct) URLs to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2327
2328
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the direct FLV URL and title from a myvideo.de watch page.

        Returns a one-element list with the standard info dictionary, or
        None after reporting the problem through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this previously called self._download.trouble — the
            # attribute is _downloader (as used everywhere else in this
            # file), so invalid URLs raised AttributeError instead of being
            # reported.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the video itself
        # is that base plus /<id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2386
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # URLs may also be the shorthand forms ':tds' / ':colbert' etc., which
    # are redirected to the shows' full-episodes pages.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # Bitrates, ordered worst-to-best; turls below follows the feed order.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolutions per bitrate, used only by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a clip's media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS clip index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the bitrate/extension/resolution table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve an episode page to its clips and return one info dict
        per clip (an episode is split into several media items)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand shorthand forms to the show's full-episodes page and
        # re-match so the named groups are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode in the URL means "download the newest episode": the
        # site redirects the bare full-episodes page to the latest one.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to learn which episode we actually got.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The Flash player reference carries the mgid URI identifying the
        # episode's media.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the mgid reference without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Resolve the player URL through its redirect chain.
        # NOTE(review): playerUrl is computed but the info dict below ships
        # 'player_url': None — confirm whether this resolution is still needed.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # Fetch the MRSS index listing the episode's individual clips.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per clip; each clip needs its own config XML to get
        # the rendition (bitrate) list.
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2574
2575
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce that extraction has started for a show."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video for one Escapist episode page.

        Reads metadata from the page's <meta> tags, follows the player's
        config JSON (actually JavaScript) and returns a one-entry info list.
        Robustness fix: every re.search result is now checked before use, so
        a site-layout change reports an error via the downloader instead of
        crashing with AttributeError on None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset the server declared, defaulting to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player url')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The config URL is carried as a query parameter of the player URL.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config url')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # rewritten before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The second playlist entry carries the actual media URL.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2647
2648
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Downloads the moogaloop metadata XML for a video, then the Adobe F4M
    manifest it references, and derives the fragment URL of the stream.
    """

    # NOTE(review): presumably flags the extractor as broken/disabled for the
    # framework -- confirm how _WORKING is consumed by the dispatcher.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a single-entry info list for the given collegehumor URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partial info dict; title/description/url/ext are filled in below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element means the XML layout changed.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # NOTE(review): hdcore looks like an HDS player-version tag the
        # manifest server expects -- confirm before changing.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest is Adobe F4M; its elements live in an XML namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Rebuild the first-fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2719
2720
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being fetched."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(page_request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is URL-encoded inside a flv_url= query fragment.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1).decode('utf-8'))

        # The title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1).decode('utf-8')

        # The thumbnail is referenced via a predictable image-host URL.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2789
2790
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract track info for a soundcloud.com/<uploader>/<slug> URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        url = 'https://soundcloud.com/%s/%s' % (uploader, slug_title)
        request = compat_urllib_request.Request(url)
        try:
            # Bug fix: the response used to be read into a variable named
            # `urlo` while the decode below referenced the undefined name
            # `webpage_bytes`, raising NameError on every extraction.
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj:
            video_id = mobj.group(1)
            stream_token = mobj.group(2)
        else:
            self._downloader.trouble(u'ERROR: unable to find video ID in Soundcloud file')
            return

        # extract unsimplified title, falling back to uploader-slug
        mobj = re.search('"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1)
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description (optional)
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1)

        # upload date; a malformed date must not abort the extraction
        upload_date = None
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            except Exception as err:
                self._downloader.to_stderr(compat_str(err))

        # for soundcloud, a request to a cross domain is required for cookies
        # NOTE(review): this Request object is built but never sent (urlopen
        # is not called on it) -- confirm whether it is still needed.
        request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        return [{
            'id': video_id,
            'url': mediaURL,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': title,
            'ext': u'mp3',
            'description': description
        }]
2886
2887
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP URL, title and description from an InfoQ page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded path in
        # jsclassref (note: .decode('base64') works on Python 2 only,
        # consistent with the rest of this module).
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description (optional)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        # Bug fix: split('.') raised ValueError when the filename contained
        # more than one dot; rsplit keeps everything before the last dot as
        # the id and is identical for the single-dot case.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2956
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Fetches a cloudcast's JSON description from the Mixcloud API and picks a
    working audio URL for the requested format/bitrate.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a bitrate -> url-list mapping or a plain url
        list; 'best' (or an unknown bitrate) selects the highest bitrate.
        The TypeError path handles the no-bitrate-info layout.
        """
        file_url = None  # NOTE(review): unused; url_list is the result
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Dead link; fall through to the next candidate.
                url = None

        return None

    def _print_formats(self, formats):
        """Print one 'format <tab> bitrate <tab> [ext]' row per entry."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    # Flat url list: print a single row and stop.
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a mixcloud URL to a single downloadable audio file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request from the last two path components
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one of its urls responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
3069
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three URL shapes: a specific video page, a course page (expanded
    into its video pages), and the root page (expanded into all courses).
    Playlist branches recurse through self.extract on each reference URL.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL type: single video, course playlist, or root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                # videoFile is relative to the course's videos/ directory.
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # Course title falls back to the course id when the <h1> is absent.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked video page becomes a reference entry, re-dispatched
            # through self.extract below; orderedSet deduplicates links.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every course page found on the root page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3186
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract performer, title and stream URL from an MTV video page.

        Scrapes several <meta> tags for metadata, downloads the mediaGen XML
        playlist and picks the last (highest quality) rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Song name and performer come from dedicated meta tags; the page
        # bytes are decoded as iso-8859-1.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fix: previously read "unable to mtvn_uri".
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Robustness fix: an empty playlist used to crash on [-1].
            self._downloader.trouble(u'ERROR: unable to extract any renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # Message fix: now unicode and prefixed with ERROR: like the
            # other trouble() calls in this module.
            self._downloader.trouble(u'ERROR: Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3276
3277
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are split into segments; each segment URL is built from an
    obfuscated file id plus a per-segment key taken from the playlist JSON.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: millisecond timestamp concatenated with two random
        # 4-digit components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically permute the alphabet with a seeded LCG-style
        # generator; presumably mirrors the player's algorithm, so the seed
        # from the playlist JSON reproduces the same permutation -- confirm.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the mixed alphabet;
        # empty components (e.g. from leading/trailing '*') are skipped.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Return one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # 'best' prefers hd2 when offered; 'worst' maps to mp4; any
            # other request falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            # One access key per segment (xrange: Python 2, like the rest
            # of this module).
            keys=[]
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
            # NOTE(review): bare except also swallows KeyboardInterrupt and
            # SystemExit; consider narrowing to Exception.
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3398
3399
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from an xnxx video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Consistency fix: format the exception through compat_str like
            # every other extractor in this module (was '%s' % err).
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3461
3462
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    # Group 1 is the numeric user/page id, group 2 the post id.
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        # Values passed in are already text; the original .decode('utf-8')
        # calls crash on Python 3 str (and on None), so they were dropped.
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video URL from a Google+ post.

        Returns a one-element list with the video info dict, or None
        after reporting trouble.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # Decode once so all regex work below is on text (py2+py3 safe).
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date; stays None if the timestamp is not found.
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader; stays None if not found.
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUG FIX: the original fell through here and crashed on
            # mobj.group(1) with AttributeError.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        found = re.findall(pattern, webpage)
        if len(found) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUG FIX: the original fell through here and crashed on an
            # empty list below.
            return

        # Sort in resolution; highest resolution sorts last.
        links = sorted(found)
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex. The py2-only unicode() builtin is
        # replaced with an encode/decode round-trip that works on py3;
        # 'backslashreplace' keeps any non-ASCII characters intact.
        video_url = video_url.encode('ascii', 'backslashreplace').decode('unicode_escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]