#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import

import base64
import datetime
import netrc
import os
import re
import socket
import time
import email.utils
import xml.etree.ElementTree
import random
import math

from .utils import *


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    as returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')


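# Illustrative sketch only: a minimal InfoExtractor subclass following the
# contract documented above. It is not part of the original module and is not
# registered in the extractor list; the site, URL pattern, page markup and
# media URL below are hypothetical.
class ExampleIE(InfoExtractor):
    """Information extractor for a hypothetical example.com."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?example\.com/video/([0-9]+)'
    IE_NAME = u'example'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # _download_webpage() reports progress and raises ExtractorError on failure
        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<title>(.*?)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        # Return a *list* of info dictionaries, as required by the base class
        return [{
            'id': video_id,
            'url': u'http://media.example.com/%s.mp4' % video_id,  # hypothetical media URL
            'uploader': None,
            'upload_date': None,
            'title': mobj.group(1),
            'ext': u'mp4',
        }]

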
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                   # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)         # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                      # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
                         (?:                              # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)           # v/ or embed/ or e/
                             |(?:                         # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)              # the params delimiter ? or # or #!
                                 (?:.*?&)?                # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                               # optional -> youtube.com/xxxx is OK
                     )?                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                     # here it is! the YouTube video ID
                     (?(1).+)?                            # if we found the ID, everything can follow
                     $"""
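    # A few illustrative examples of URL shapes the verbose pattern above is
    # meant to accept (the video ID is made up):
    #   http://www.youtube.com/watch?v=BaW_jenozKc
    #   https://youtu.be/BaW_jenozKc
    #   http://www.youtube.com/embed/BaW_jenozKc
    #   BaW_jenozKc                      (a naked video ID also matches)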
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" % (start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" % (end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

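    # Illustrative example of what _closed_captions_xml_to_srt() produces: a
    # timedtext node such as
    #   <text start="1.0" dur="4.0">Hello &amp;amp; welcome</text>
    # becomes the SRT entry
    #   1
    #   00:00:01,000 --> 00:00:05,000
    #   Hello & welcome
    # (the caption text is HTML-unescaped twice, hence the double cycle above).
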
    def _extract_subtitles(self, video_id):
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

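        # Illustrative ~/.netrc entry for the lookup above (standard netrc syntax):
        #   machine youtube login your@email.example password yourpassword
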
        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

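        # Illustrative example of the date normalisation above: a page string
        # such as "Jan 25, 2012" is reduced to "Jan 25 2012" and then parsed
        # with the '%b %d %Y' expression, giving upload_date = '20120125'.
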
        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results


class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]


class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]


class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]


class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]


class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

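        # Illustrative example of the selection above: if config["video"]["files"]
        # were {'h264': ['hd', 'sd'], 'vp8': ['sd']} (hypothetical), files['hd']
        # would contain ('h264', 'mp4', 'hd'), so the H264 file at HD quality
        # would be downloaded with the mp4 extension.
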
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]


class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]


class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if the URL is a redirect (e.g. a URL shortener) and, if so, restart the extraction chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

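    # Illustrative example: for a shortener URL such as http://bit.ly/something
    # (hypothetical), _test_redirect() issues a HEAD request, follows the
    # redirect chain to the real page, and re-dispatches the resolved URL via
    # self._downloader.download([new_url]) so a more specific extractor can
    # handle it.
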
    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]


class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'
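    # Illustrative query-prefix examples, as handled in _real_extract() below:
    #   ytsearch:cute cats      -> download the first result
    #   ytsearch5:cute cats     -> download the first five results
    #   ytsearchall:cute cats   -> download up to _max_youtube_results results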

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return


1445class GoogleSearchIE(InfoExtractor):
59ae15a5
PH
1446 """Information Extractor for Google Video search queries."""
1447 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1448 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1449 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1450 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1451 _max_google_results = 1000
1452 IE_NAME = u'video.google:search'
1453
1454 def __init__(self, downloader=None):
1455 InfoExtractor.__init__(self, downloader)
1456
1457 def report_download_page(self, query, pagenum):
1458 """Report attempt to download playlist page with given number."""
1459 query = query.decode(preferredencoding())
1460 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1461
1462 def _real_extract(self, query):
1463 mobj = re.match(self._VALID_URL, query)
1464 if mobj is None:
1465 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1466 return
1467
 1468 prefix, query = query.split(':', 1)
1469 prefix = prefix[8:]
1470 query = query.encode('utf-8')
1471 if prefix == '':
1472 self._download_n_results(query, 1)
1473 return
1474 elif prefix == 'all':
1475 self._download_n_results(query, self._max_google_results)
1476 return
1477 else:
1478 try:
1479 n = int(prefix)
1480 if n <= 0:
1481 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1482 return
1483 elif n > self._max_google_results:
1484 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1485 n = self._max_google_results
1486 self._download_n_results(query, n)
1487 return
1488 except ValueError: # parsing prefix as integer fails
1489 self._download_n_results(query, 1)
1490 return
1491
1492 def _download_n_results(self, query, n):
1493 """Downloads a specified number of results for a query"""
1494
1495 video_ids = []
1496 pagenum = 0
1497
1498 while True:
1499 self.report_download_page(query, pagenum)
1500 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1501 request = compat_urllib_request.Request(result_url)
1502 try:
1503 page = compat_urllib_request.urlopen(request).read()
1504 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1505 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1506 return
1507
1508 # Extract video identifiers
1509 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1510 video_id = mobj.group(1)
1511 if video_id not in video_ids:
1512 video_ids.append(video_id)
1513 if len(video_ids) == n:
1514 # Specified n videos reached
1515 for id in video_ids:
1516 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1517 return
1518
1519 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1520 for id in video_ids:
1521 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1522 return
1523
1524 pagenum = pagenum + 1
d77c3dfd
FV
1525
1526
1527class YahooSearchIE(InfoExtractor):
59ae15a5 1528 """Information Extractor for Yahoo! Video search queries."""
93702113
FV
1529
1530 _WORKING = False
59ae15a5
PH
1531 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1532 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1533 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1534 _MORE_PAGES_INDICATOR = r'\s*Next'
1535 _max_yahoo_results = 1000
1536 IE_NAME = u'video.yahoo:search'
1537
1538 def __init__(self, downloader=None):
1539 InfoExtractor.__init__(self, downloader)
1540
1541 def report_download_page(self, query, pagenum):
1542 """Report attempt to download playlist page with given number."""
1543 query = query.decode(preferredencoding())
1544 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1545
1546 def _real_extract(self, query):
1547 mobj = re.match(self._VALID_URL, query)
1548 if mobj is None:
1549 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1550 return
1551
 1552 prefix, query = query.split(':', 1)
1553 prefix = prefix[8:]
1554 query = query.encode('utf-8')
1555 if prefix == '':
1556 self._download_n_results(query, 1)
1557 return
1558 elif prefix == 'all':
1559 self._download_n_results(query, self._max_yahoo_results)
1560 return
1561 else:
1562 try:
1563 n = int(prefix)
1564 if n <= 0:
1565 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1566 return
1567 elif n > self._max_yahoo_results:
1568 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1569 n = self._max_yahoo_results
1570 self._download_n_results(query, n)
1571 return
1572 except ValueError: # parsing prefix as integer fails
1573 self._download_n_results(query, 1)
1574 return
1575
1576 def _download_n_results(self, query, n):
1577 """Downloads a specified number of results for a query"""
1578
1579 video_ids = []
1580 already_seen = set()
1581 pagenum = 1
1582
1583 while True:
1584 self.report_download_page(query, pagenum)
1585 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1586 request = compat_urllib_request.Request(result_url)
1587 try:
1588 page = compat_urllib_request.urlopen(request).read()
1589 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1590 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1591 return
1592
1593 # Extract video identifiers
1594 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1595 video_id = mobj.group(1)
1596 if video_id not in already_seen:
1597 video_ids.append(video_id)
1598 already_seen.add(video_id)
1599 if len(video_ids) == n:
1600 # Specified n videos reached
1601 for id in video_ids:
1602 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1603 return
1604
1605 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1606 for id in video_ids:
1607 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1608 return
1609
1610 pagenum = pagenum + 1
d77c3dfd
FV
1611
1612
1613class YoutubePlaylistIE(InfoExtractor):
59ae15a5
PH
1614 """Information Extractor for YouTube playlists."""
1615
e387eb5a 1616 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
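# Hypothetical examples of URLs accepted by _VALID_URL above (ids are made up):
#   http://www.youtube.com/playlist?list=PL0123456789abcdef
#   http://www.youtube.com/view_play_list?p=0123456789abcdef
#   http://www.youtube.com/course?list=EC0123456789abcdef
#   PL0123456789abcdef   (a bare playlist id is accepted as well)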
59ae15a5
PH
1617 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1618 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
9789a05c 1619 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
59ae15a5
PH
1620 IE_NAME = u'youtube:playlist'
1621
1622 def __init__(self, downloader=None):
1623 InfoExtractor.__init__(self, downloader)
1624
1625 def report_download_page(self, playlist_id, pagenum):
1626 """Report attempt to download playlist page with given number."""
1627 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1628
1629 def _real_extract(self, url):
1630 # Extract playlist id
1631 mobj = re.match(self._VALID_URL, url)
1632 if mobj is None:
1633 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1634 return
1635
1636 # Single video case
1637 if mobj.group(3) is not None:
1638 self._downloader.download([mobj.group(3)])
1639 return
1640
1641 # Download playlist pages
1642 # prefix is 'p' as default for playlists but there are other types that need extra care
1643 playlist_prefix = mobj.group(1)
1644 if playlist_prefix == 'a':
1645 playlist_access = 'artist'
1646 else:
1647 playlist_prefix = 'p'
1648 playlist_access = 'view_play_list'
1649 playlist_id = mobj.group(2)
1650 video_ids = []
1651 pagenum = 1
1652
1653 while True:
1654 self.report_download_page(playlist_id, pagenum)
1655 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1656 request = compat_urllib_request.Request(url)
1657 try:
80d3177e 1658 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
1659 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1660 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1661 return
1662
1663 # Extract video identifiers
1664 ids_in_page = []
1665 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1666 if mobj.group(1) not in ids_in_page:
1667 ids_in_page.append(mobj.group(1))
1668 video_ids.extend(ids_in_page)
1669
9789a05c 1670 if self._MORE_PAGES_INDICATOR not in page:
59ae15a5
PH
1671 break
1672 pagenum = pagenum + 1
1673
9789a05c
FV
1674 total = len(video_ids)
1675
59ae15a5
PH
1676 playliststart = self._downloader.params.get('playliststart', 1) - 1
1677 playlistend = self._downloader.params.get('playlistend', -1)
1678 if playlistend == -1:
1679 video_ids = video_ids[playliststart:]
1680 else:
1681 video_ids = video_ids[playliststart:playlistend]
1682
9789a05c
FV
1683 if len(video_ids) == total:
1684 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1685 else:
1686 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1687
59ae15a5
PH
1688 for id in video_ids:
1689 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1690 return
d77c3dfd
FV
1691
1692
902b2a0a 1693class YoutubeChannelIE(InfoExtractor):
59ae15a5
PH
1694 """Information Extractor for YouTube channels."""
1695
1696 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1697 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
9789a05c 1698 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
59ae15a5
PH
1699 IE_NAME = u'youtube:channel'
1700
1701 def report_download_page(self, channel_id, pagenum):
1702 """Report attempt to download channel page with given number."""
1703 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1704
1705 def _real_extract(self, url):
1706 # Extract channel id
1707 mobj = re.match(self._VALID_URL, url)
1708 if mobj is None:
1709 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1710 return
1711
1712 # Download channel pages
1713 channel_id = mobj.group(1)
1714 video_ids = []
1715 pagenum = 1
1716
1717 while True:
1718 self.report_download_page(channel_id, pagenum)
1719 url = self._TEMPLATE_URL % (channel_id, pagenum)
1720 request = compat_urllib_request.Request(url)
1721 try:
9789a05c 1722 page = compat_urllib_request.urlopen(request).read().decode('utf8')
59ae15a5
PH
1723 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1724 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1725 return
1726
1727 # Extract video identifiers
1728 ids_in_page = []
1729 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1730 if mobj.group(1) not in ids_in_page:
1731 ids_in_page.append(mobj.group(1))
1732 video_ids.extend(ids_in_page)
1733
9789a05c 1734 if self._MORE_PAGES_INDICATOR not in page:
59ae15a5
PH
1735 break
1736 pagenum = pagenum + 1
1737
9789a05c
FV
1738 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1739
59ae15a5
PH
1740 for id in video_ids:
1741 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1742 return
902b2a0a
FV
1743
1744
d77c3dfd 1745class YoutubeUserIE(InfoExtractor):
59ae15a5 1746 """Information Extractor for YouTube users."""
d77c3dfd 1747
59ae15a5
PH
1748 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1749 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1750 _GDATA_PAGE_SIZE = 50
1751 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1752 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1753 IE_NAME = u'youtube:user'
d77c3dfd 1754
59ae15a5
PH
1755 def __init__(self, downloader=None):
1756 InfoExtractor.__init__(self, downloader)
d77c3dfd 1757
59ae15a5
PH
1758 def report_download_page(self, username, start_index):
1759 """Report attempt to download user page."""
1760 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1761 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
d77c3dfd 1762
59ae15a5
PH
1763 def _real_extract(self, url):
1764 # Extract username
1765 mobj = re.match(self._VALID_URL, url)
1766 if mobj is None:
1767 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1768 return
d77c3dfd 1769
59ae15a5 1770 username = mobj.group(1)
d77c3dfd 1771
59ae15a5
PH
1772 # Download video ids using YouTube Data API. Result size per
1773 # query is limited (currently to 50 videos) so we need to query
1774 # page by page until there are no video ids - it means we got
1775 # all of them.
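# For instance, with the defaults above (_GDATA_PAGE_SIZE = 50, start-index is
# 1-based), the first two requests built from _GDATA_URL look like:
#   .../feeds/api/users/<username>/uploads?max-results=50&start-index=1
#   .../feeds/api/users/<username>/uploads?max-results=50&start-index=51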
d77c3dfd 1776
59ae15a5
PH
1777 video_ids = []
1778 pagenum = 0
d77c3dfd 1779
59ae15a5
PH
1780 while True:
1781 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1782 self.report_download_page(username, start_index)
d77c3dfd 1783
59ae15a5 1784 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
d77c3dfd 1785
59ae15a5 1786 try:
80d3177e 1787 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
1788 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1789 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1790 return
d77c3dfd 1791
59ae15a5
PH
1792 # Extract video identifiers
1793 ids_in_page = []
d77c3dfd 1794
59ae15a5
PH
1795 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1796 if mobj.group(1) not in ids_in_page:
1797 ids_in_page.append(mobj.group(1))
d77c3dfd 1798
59ae15a5 1799 video_ids.extend(ids_in_page)
d77c3dfd 1800
59ae15a5
PH
1801 # A little optimization - if current page is not
1802 # "full", ie. does not contain PAGE_SIZE video ids then
1803 # we can assume that this page is the last one - there
1804 # are no more ids on further pages - no need to query
1805 # again.
d77c3dfd 1806
59ae15a5
PH
1807 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1808 break
d77c3dfd 1809
59ae15a5 1810 pagenum += 1
d77c3dfd 1811
59ae15a5
PH
1812 all_ids_count = len(video_ids)
1813 playliststart = self._downloader.params.get('playliststart', 1) - 1
1814 playlistend = self._downloader.params.get('playlistend', -1)
d77c3dfd 1815
59ae15a5
PH
1816 if playlistend == -1:
1817 video_ids = video_ids[playliststart:]
1818 else:
1819 video_ids = video_ids[playliststart:playlistend]
d77c3dfd 1820
59ae15a5
PH
1821 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1822 (username, all_ids_count, len(video_ids)))
d77c3dfd 1823
59ae15a5
PH
1824 for video_id in video_ids:
1825 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1826
1827
eeeb4daa 1828class BlipTVUserIE(InfoExtractor):
59ae15a5 1829 """Information Extractor for blip.tv users."""
eeeb4daa 1830
59ae15a5
PH
1831 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1832 _PAGE_SIZE = 12
1833 IE_NAME = u'blip.tv:user'
eeeb4daa 1834
59ae15a5
PH
1835 def __init__(self, downloader=None):
1836 InfoExtractor.__init__(self, downloader)
eeeb4daa 1837
59ae15a5
PH
1838 def report_download_page(self, username, pagenum):
1839 """Report attempt to download user page."""
1840 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1841 (self.IE_NAME, username, pagenum))
eeeb4daa 1842
59ae15a5
PH
1843 def _real_extract(self, url):
1844 # Extract username
1845 mobj = re.match(self._VALID_URL, url)
1846 if mobj is None:
1847 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1848 return
eeeb4daa 1849
59ae15a5 1850 username = mobj.group(1)
eeeb4daa 1851
59ae15a5 1852 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
eeeb4daa 1853
59ae15a5 1854 request = compat_urllib_request.Request(url)
eeeb4daa 1855
59ae15a5
PH
1856 try:
1857 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1858 mobj = re.search(r'data-users-id="([^"]+)"', page)
1859 page_base = page_base % mobj.group(1)
1860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1861 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1862 return
eeeb4daa
JCGS
1863
1864
59ae15a5
PH
1865 # Download video ids using BlipTV Ajax calls. Result size per
1866 # query is limited (currently to 12 videos) so we need to query
1867 # page by page until there are no video ids - it means we got
1868 # all of them.
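# Concretely, the loop below just appends "&page=1", "&page=2", ... to the
# page_base URL resolved from the data-users-id attribute above.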
eeeb4daa 1869
59ae15a5
PH
1870 video_ids = []
1871 pagenum = 1
eeeb4daa 1872
59ae15a5
PH
1873 while True:
1874 self.report_download_page(username, pagenum)
eeeb4daa 1875
59ae15a5 1876 request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))
eeeb4daa 1877
59ae15a5
PH
1878 try:
1879 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1880 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 1881 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1882 return
eeeb4daa 1883
59ae15a5
PH
1884 # Extract video identifiers
1885 ids_in_page = []
eeeb4daa 1886
59ae15a5
PH
1887 for mobj in re.finditer(r'href="/([^"]+)"', page):
1888 if mobj.group(1) not in ids_in_page:
1889 ids_in_page.append(unescapeHTML(mobj.group(1)))
eeeb4daa 1890
59ae15a5 1891 video_ids.extend(ids_in_page)
eeeb4daa 1892
59ae15a5
PH
1893 # A little optimization - if current page is not
1894 # "full", ie. does not contain PAGE_SIZE video ids then
1895 # we can assume that this page is the last one - there
1896 # are no more ids on further pages - no need to query
1897 # again.
eeeb4daa 1898
59ae15a5
PH
1899 if len(ids_in_page) < self._PAGE_SIZE:
1900 break
eeeb4daa 1901
59ae15a5 1902 pagenum += 1
eeeb4daa 1903
59ae15a5
PH
1904 all_ids_count = len(video_ids)
1905 playliststart = self._downloader.params.get('playliststart', 1) - 1
1906 playlistend = self._downloader.params.get('playlistend', -1)
eeeb4daa 1907
59ae15a5
PH
1908 if playlistend == -1:
1909 video_ids = video_ids[playliststart:]
1910 else:
1911 video_ids = video_ids[playliststart:playlistend]
eeeb4daa 1912
59ae15a5
PH
1913 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1914 (self.IE_NAME, username, all_ids_count, len(video_ids)))
eeeb4daa 1915
59ae15a5
PH
1916 for video_id in video_ids:
1917 self._downloader.download([u'http://blip.tv/'+video_id])
eeeb4daa
JCGS
1918
1919
d77c3dfd 1920class DepositFilesIE(InfoExtractor):
59ae15a5
PH
1921 """Information extractor for depositfiles.com"""
1922
1923 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
59ae15a5
PH
1924
1925 def report_download_webpage(self, file_id):
1926 """Report webpage download."""
1927 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1928
1929 def report_extraction(self, file_id):
1930 """Report information extraction."""
1931 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1932
1933 def _real_extract(self, url):
1934 file_id = url.split('/')[-1]
1935 # Rebuild url in english locale
1936 url = 'http://depositfiles.com/en/files/' + file_id
1937
1938 # Retrieve file webpage with 'Free download' button pressed
1939 free_download_indication = { 'gateway_result' : '1' }
1940 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1941 try:
1942 self.report_download_webpage(file_id)
1943 webpage = compat_urllib_request.urlopen(request).read()
1944 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1945 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1946 return
1947
1948 # Search for the real file URL
1949 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1950 if (mobj is None) or (mobj.group(1) is None):
1951 # Try to figure out reason of the error.
1952 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1953 if (mobj is not None) and (mobj.group(1) is not None):
 1954 restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
1955 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1956 else:
1957 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1958 return
1959
1960 file_url = mobj.group(1)
1961 file_extension = os.path.splitext(file_url)[1][1:]
1962
1963 # Search for file title
1964 mobj = re.search(r'<b title="(.*?)">', webpage)
1965 if mobj is None:
1966 self._downloader.trouble(u'ERROR: unable to extract title')
1967 return
1968 file_title = mobj.group(1).decode('utf-8')
1969
1970 return [{
1971 'id': file_id.decode('utf-8'),
1972 'url': file_url.decode('utf-8'),
1973 'uploader': None,
1974 'upload_date': None,
1975 'title': file_title,
1976 'ext': file_extension.decode('utf-8'),
1977 }]
d77c3dfd
FV
1978
1979
1980class FacebookIE(InfoExtractor):
59ae15a5
PH
1981 """Information Extractor for Facebook"""
1982
1983 _WORKING = False
1984 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1985 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1986 _NETRC_MACHINE = 'facebook'
1987 _available_formats = ['video', 'highqual', 'lowqual']
1988 _video_extensions = {
1989 'video': 'mp4',
1990 'highqual': 'mp4',
1991 'lowqual': 'mp4',
1992 }
1993 IE_NAME = u'facebook'
1994
1995 def __init__(self, downloader=None):
1996 InfoExtractor.__init__(self, downloader)
1997
1998 def _reporter(self, message):
1999 """Add header and report message."""
2000 self._downloader.to_screen(u'[facebook] %s' % message)
2001
2002 def report_login(self):
2003 """Report attempt to log in."""
2004 self._reporter(u'Logging in')
2005
2006 def report_video_webpage_download(self, video_id):
2007 """Report attempt to download video webpage."""
2008 self._reporter(u'%s: Downloading video webpage' % video_id)
2009
2010 def report_information_extraction(self, video_id):
2011 """Report attempt to extract video information."""
2012 self._reporter(u'%s: Extracting video information' % video_id)
2013
2014 def _parse_page(self, video_webpage):
2015 """Extract video information from page"""
2016 # General data
2017 data = {'title': r'\("video_title", "(.*?)"\)',
2018 'description': r'<div class="datawrap">(.*?)</div>',
2019 'owner': r'\("video_owner_name", "(.*?)"\)',
2020 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2021 }
2022 video_info = {}
2023 for piece in data.keys():
2024 mobj = re.search(data[piece], video_webpage)
2025 if mobj is not None:
2026 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2027
2028 # Video urls
2029 video_urls = {}
2030 for fmt in self._available_formats:
2031 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2032 if mobj is not None:
 2033 # URL is in a JavaScript segment inside an escaped Unicode format within
2034 # the generally utf-8 page
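# Hypothetical example of the decoding below:
#   ("video_src", "http\u00253A\u00252F\u00252Fexample.com\u00252Fv.mp4")
#   -> decode("unicode_escape") -> "http%3A%2F%2Fexample.com%2Fv.mp4"
#   -> unquote_plus            -> "http://example.com/v.mp4"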
2035 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2036 video_info['video_urls'] = video_urls
2037
2038 return video_info
2039
2040 def _real_initialize(self):
2041 if self._downloader is None:
2042 return
2043
2044 useremail = None
2045 password = None
2046 downloader_params = self._downloader.params
2047
2048 # Attempt to use provided username and password or .netrc data
2049 if downloader_params.get('username', None) is not None:
2050 useremail = downloader_params['username']
2051 password = downloader_params['password']
2052 elif downloader_params.get('usenetrc', False):
2053 try:
2054 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2055 if info is not None:
2056 useremail = info[0]
2057 password = info[2]
2058 else:
2059 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2060 except (IOError, netrc.NetrcParseError) as err:
2061 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2062 return
2063
2064 if useremail is None:
2065 return
2066
2067 # Log in
2068 login_form = {
2069 'email': useremail,
2070 'pass': password,
2071 'login': 'Log+In'
2072 }
2073 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2074 try:
2075 self.report_login()
2076 login_results = compat_urllib_request.urlopen(request).read()
2077 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
 2078 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
2079 return
2080 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2081 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2082 return
2083
2084 def _real_extract(self, url):
2085 mobj = re.match(self._VALID_URL, url)
2086 if mobj is None:
2087 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2088 return
2089 video_id = mobj.group('ID')
2090
2091 # Get video webpage
2092 self.report_video_webpage_download(video_id)
2093 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2094 try:
2095 page = compat_urllib_request.urlopen(request)
2096 video_webpage = page.read()
2097 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2098 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2099 return
2100
2101 # Start extracting information
2102 self.report_information_extraction(video_id)
2103
2104 # Extract information
2105 video_info = self._parse_page(video_webpage)
2106
2107 # uploader
2108 if 'owner' not in video_info:
2109 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2110 return
2111 video_uploader = video_info['owner']
2112
2113 # title
2114 if 'title' not in video_info:
2115 self._downloader.trouble(u'ERROR: unable to extract video title')
2116 return
2117 video_title = video_info['title']
2118 video_title = video_title.decode('utf-8')
2119
2120 # thumbnail image
2121 if 'thumbnail' not in video_info:
2122 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2123 video_thumbnail = ''
2124 else:
2125 video_thumbnail = video_info['thumbnail']
2126
2127 # upload date
2128 upload_date = None
2129 if 'upload_date' in video_info:
2130 upload_time = video_info['upload_date']
2131 timetuple = email.utils.parsedate_tz(upload_time)
2132 if timetuple is not None:
2133 try:
2134 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
 2135 except Exception:
2136 pass
2137
2138 # description
2139 video_description = video_info.get('description', 'No description available.')
2140
2141 url_map = video_info['video_urls']
99b0a129 2142 if url_map:
59ae15a5
PH
2143 # Decide which formats to download
2144 req_format = self._downloader.params.get('format', None)
2145 format_limit = self._downloader.params.get('format_limit', None)
2146
2147 if format_limit is not None and format_limit in self._available_formats:
2148 format_list = self._available_formats[self._available_formats.index(format_limit):]
2149 else:
2150 format_list = self._available_formats
2151 existing_formats = [x for x in format_list if x in url_map]
2152 if len(existing_formats) == 0:
2153 self._downloader.trouble(u'ERROR: no known formats available for video')
2154 return
2155 if req_format is None:
2156 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2157 elif req_format == 'worst':
 2158 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
2159 elif req_format == '-1':
2160 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2161 else:
2162 # Specific format
2163 if req_format not in url_map:
2164 self._downloader.trouble(u'ERROR: requested format not available')
2165 return
2166 video_url_list = [(req_format, url_map[req_format])] # Specific format
2167
2168 results = []
2169 for format_param, video_real_url in video_url_list:
2170 # Extension
2171 video_extension = self._video_extensions.get(format_param, 'mp4')
2172
2173 results.append({
2174 'id': video_id.decode('utf-8'),
2175 'url': video_real_url.decode('utf-8'),
2176 'uploader': video_uploader.decode('utf-8'),
2177 'upload_date': upload_date,
2178 'title': video_title,
2179 'ext': video_extension.decode('utf-8'),
2180 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2181 'thumbnail': video_thumbnail.decode('utf-8'),
2182 'description': video_description.decode('utf-8'),
2183 })
2184 return results
d77c3dfd
FV
2185
2186class BlipTVIE(InfoExtractor):
59ae15a5
PH
2187 """Information extractor for blip.tv"""
2188
2189 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2190 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2191 IE_NAME = u'blip.tv'
2192
2193 def report_extraction(self, file_id):
2194 """Report information extraction."""
2195 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2196
2197 def report_direct_download(self, title):
2198 """Report information extraction."""
2199 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2200
2201 def _real_extract(self, url):
2202 mobj = re.match(self._VALID_URL, url)
2203 if mobj is None:
2204 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2205 return
2206
2207 if '?' in url:
2208 cchar = '&'
2209 else:
2210 cchar = '?'
2211 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
55c05398 2212 request = compat_urllib_request.Request(json_url)
3446dfb7 2213 request.add_header('User-Agent', 'iTunes/10.6.1')
59ae15a5
PH
2214 self.report_extraction(mobj.group(1))
2215 info = None
2216 try:
2217 urlh = compat_urllib_request.urlopen(request)
2218 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2219 basename = url.split('/')[-1]
2220 title,ext = os.path.splitext(basename)
2221 title = title.decode('UTF-8')
2222 ext = ext.replace('.', '')
2223 self.report_direct_download(title)
2224 info = {
2225 'id': title,
2226 'url': url,
2227 'uploader': None,
2228 'upload_date': None,
2229 'title': title,
2230 'ext': ext,
2231 'urlhandle': urlh
2232 }
2233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3446dfb7 2234 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
59ae15a5
PH
2235 if info is None: # Regular URL
2236 try:
55c05398
PH
2237 json_code_bytes = urlh.read()
2238 json_code = json_code_bytes.decode('utf-8')
59ae15a5
PH
2239 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2240 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2241 return
2242
2243 try:
2244 json_data = json.loads(json_code)
2245 if 'Post' in json_data:
2246 data = json_data['Post']
2247 else:
2248 data = json_data
2249
2250 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2251 video_url = data['media']['url']
2252 umobj = re.match(self._URL_EXT, video_url)
2253 if umobj is None:
2254 raise ValueError('Can not determine filename extension')
2255 ext = umobj.group(1)
2256
2257 info = {
2258 'id': data['item_id'],
2259 'url': video_url,
2260 'uploader': data['display_name'],
2261 'upload_date': upload_date,
2262 'title': data['title'],
2263 'ext': ext,
2264 'format': data['media']['mimeType'],
2265 'thumbnail': data['thumbnailUrl'],
2266 'description': data['description'],
3446dfb7
PH
2267 'player_url': data['embedUrl'],
2268 'user_agent': 'iTunes/10.6.1',
59ae15a5
PH
2269 }
2270 except (ValueError,KeyError) as err:
2271 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2272 return
2273
59ae15a5 2274 return [info]
d77c3dfd
FV
2275
2276
2277class MyVideoIE(InfoExtractor):
59ae15a5
PH
2278 """Information Extractor for myvideo.de."""
2279
2280 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2281 IE_NAME = u'myvideo'
2282
2283 def __init__(self, downloader=None):
2284 InfoExtractor.__init__(self, downloader)
cdb30764 2285
59ae15a5
PH
2286 def report_extraction(self, video_id):
2287 """Report information extraction."""
2288 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2289
2290 def _real_extract(self,url):
2291 mobj = re.match(self._VALID_URL, url)
2292 if mobj is None:
 2293 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2294 return
2295
2296 video_id = mobj.group(1)
2297
2298 # Get video webpage
5f955171
PH
2299 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2300 webpage = self._download_webpage(webpage_url, video_id)
59ae15a5
PH
2301
2302 self.report_extraction(video_id)
2303 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2304 webpage)
2305 if mobj is None:
2306 self._downloader.trouble(u'ERROR: unable to extract media URL')
2307 return
2308 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2309
2310 mobj = re.search('<title>([^<]+)</title>', webpage)
2311 if mobj is None:
2312 self._downloader.trouble(u'ERROR: unable to extract title')
2313 return
2314
2315 video_title = mobj.group(1)
2316
2317 return [{
2318 'id': video_id,
2319 'url': video_url,
2320 'uploader': None,
2321 'upload_date': None,
2322 'title': video_title,
2323 'ext': u'flv',
2324 }]
d77c3dfd
FV
2325
2326class ComedyCentralIE(InfoExtractor):
59ae15a5
PH
2327 """Information extractor for The Daily Show and Colbert Report """
2328
ca6849e6 2329 # urls can be abbreviations like :thedailyshow or :colbert
cdb30764 2330 # urls for episodes like:
ca6849e6 2331 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2332 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
cdb30764 2333 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
ca6849e6 2334 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2335 |(https?://)?(www\.)?
2336 (?P<showname>thedailyshow|colbertnation)\.com/
2337 (full-episodes/(?P<episode>.*)|
2338 (?P<clip>
2339 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2340 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
cdb30764 2341 $"""
59ae15a5
PH
2342
2343 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2344
2345 _video_extensions = {
2346 '3500': 'mp4',
2347 '2200': 'mp4',
2348 '1700': 'mp4',
2349 '1200': 'mp4',
2350 '750': 'mp4',
2351 '400': 'mp4',
2352 }
2353 _video_dimensions = {
2354 '3500': '1280x720',
2355 '2200': '960x540',
2356 '1700': '768x432',
2357 '1200': '640x360',
2358 '750': '512x288',
2359 '400': '384x216',
2360 }
2361
ca6849e6 2362 def suitable(self, url):
2363 """Receives a URL and returns True if suitable for this IE."""
2364 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2365
59ae15a5
PH
2366 def report_extraction(self, episode_id):
2367 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2368
32635ec6
PH
2369 def report_config_download(self, episode_id, media_id):
2370 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
59ae15a5
PH
2371
2372 def report_index_download(self, episode_id):
2373 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2374
59ae15a5
PH
2375 def _print_formats(self, formats):
2376 print('Available formats:')
2377 for x in formats:
2378 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2379
2380
2381 def _real_extract(self, url):
ca6849e6 2382 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2383 if mobj is None:
2384 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2385 return
2386
2387 if mobj.group('shortname'):
2388 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2389 url = u'http://www.thedailyshow.com/full-episodes/'
2390 else:
2391 url = u'http://www.colbertnation.com/full-episodes/'
ca6849e6 2392 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2393 assert mobj is not None
2394
ca6849e6 2395 if mobj.group('clip'):
2396 if mobj.group('showname') == 'thedailyshow':
2397 epTitle = mobj.group('tdstitle')
2398 else:
2399 epTitle = mobj.group('cntitle')
2400 dlNewest = False
59ae15a5 2401 else:
ca6849e6 2402 dlNewest = not mobj.group('episode')
2403 if dlNewest:
2404 epTitle = mobj.group('showname')
2405 else:
2406 epTitle = mobj.group('episode')
59ae15a5
PH
2407
2408 req = compat_urllib_request.Request(url)
2409 self.report_extraction(epTitle)
2410 try:
2411 htmlHandle = compat_urllib_request.urlopen(req)
2412 html = htmlHandle.read()
93148102 2413 webpage = html.decode('utf-8')
59ae15a5
PH
2414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2415 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2416 return
2417 if dlNewest:
2418 url = htmlHandle.geturl()
ca6849e6 2419 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
59ae15a5
PH
2420 if mobj is None:
2421 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2422 return
2423 if mobj.group('episode') == '':
2424 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2425 return
2426 epTitle = mobj.group('episode')
2427
93148102 2428 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
59ae15a5
PH
2429
2430 if len(mMovieParams) == 0:
 2431 # The Colbert Report embeds the information in a data-mgid attribute without
2432 # a URL prefix; so extract the alternate reference
2433 # and then add the URL prefix manually.
2434
93148102 2435 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
59ae15a5
PH
2436 if len(altMovieParams) == 0:
2437 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2438 return
2439 else:
2440 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
cdb30764 2441
59ae15a5
PH
2442 uri = mMovieParams[0][1]
2443 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2444 self.report_index_download(epTitle)
2445 try:
2446 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2447 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2448 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2449 return
2450
2451 results = []
2452
2453 idoc = xml.etree.ElementTree.fromstring(indexXml)
2454 itemEls = idoc.findall('.//item')
7717ae19 2455 for partNum,itemEl in enumerate(itemEls):
59ae15a5
PH
2456 mediaId = itemEl.findall('./guid')[0].text
2457 shortMediaId = mediaId.split(':')[-1]
2458 showId = mediaId.split(':')[-2].replace('.com', '')
2459 officialTitle = itemEl.findall('./title')[0].text
2460 officialDate = itemEl.findall('./pubDate')[0].text
2461
2462 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2463 compat_urllib_parse.urlencode({'uri': mediaId}))
2464 configReq = compat_urllib_request.Request(configUrl)
32635ec6 2465 self.report_config_download(epTitle, shortMediaId)
59ae15a5
PH
2466 try:
2467 configXml = compat_urllib_request.urlopen(configReq).read()
2468 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2469 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2470 return
2471
2472 cdoc = xml.etree.ElementTree.fromstring(configXml)
2473 turls = []
2474 for rendition in cdoc.findall('.//rendition'):
2475 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2476 turls.append(finfo)
2477
2478 if len(turls) == 0:
2479 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2480 continue
cdb30764 2481
59ae15a5
PH
2482 if self._downloader.params.get('listformats', None):
2483 self._print_formats([i[0] for i in turls])
2484 return
2485
2486 # For now, just pick the highest bitrate
32635ec6 2487 format,rtmp_video_url = turls[-1]
59ae15a5
PH
2488
2489 # Get the format arg from the arg stream
2490 req_format = self._downloader.params.get('format', None)
2491
2492 # Select format if we can find one
2493 for f,v in turls:
2494 if f == req_format:
32635ec6 2495 format, rtmp_video_url = f, v
59ae15a5
PH
2496 break
2497
32635ec6
PH
2498 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2499 if not m:
2500 raise ExtractorError(u'Cannot transform RTMP url')
2501 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2502 video_url = base + m.group('finalid')
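# Example of the rewrite above (path is hypothetical): an RTMP URL such as
#   rtmpe://host/ondemand/gsp.comedystor/com/thedailyshow/clip_123.mp4
# becomes base + 'gsp.comedystor/com/thedailyshow/clip_123.mp4'.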
59ae15a5 2503
7717ae19 2504 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
59ae15a5
PH
2505 info = {
2506 'id': shortMediaId,
2507 'url': video_url,
2508 'uploader': showId,
2509 'upload_date': officialDate,
2510 'title': effTitle,
2511 'ext': 'mp4',
2512 'format': format,
2513 'thumbnail': None,
2514 'description': officialTitle,
59ae15a5 2515 }
59ae15a5 2516 results.append(info)
cdb30764 2517
59ae15a5 2518 return results
d77c3dfd
FV
2519
2520
2521class EscapistIE(InfoExtractor):
59ae15a5
PH
2522 """Information extractor for The Escapist """
2523
2524 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2525 IE_NAME = u'escapist'
2526
2527 def report_extraction(self, showName):
2528 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2529
2530 def report_config_download(self, showName):
2531 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2532
2533 def _real_extract(self, url):
2534 mobj = re.match(self._VALID_URL, url)
2535 if mobj is None:
2536 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2537 return
2538 showName = mobj.group('showname')
2539 videoId = mobj.group('episode')
2540
2541 self.report_extraction(showName)
2542 try:
2543 webPage = compat_urllib_request.urlopen(url)
2544 webPageBytes = webPage.read()
2545 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2546 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2547 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2548 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2549 return
2550
2551 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2552 description = unescapeHTML(descMatch.group(1))
2553 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2554 imgUrl = unescapeHTML(imgMatch.group(1))
2555 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2556 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2557 configUrlMatch = re.search('config=(.*)$', playerUrl)
2558 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2559
2560 self.report_config_download(showName)
2561 try:
93702113
FV
2562 configJSON = compat_urllib_request.urlopen(configUrl)
2563 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2564 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
59ae15a5
PH
2565 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2566 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2567 return
2568
2569 # Technically, it's JavaScript, not JSON
2570 configJSON = configJSON.replace("'", '"')
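# e.g. (hypothetical) {'playlist': [{'url': '...'}]} becomes
# {"playlist": [{"url": "..."}]}, which json.loads below can parse.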
2571
2572 try:
2573 config = json.loads(configJSON)
2574 except (ValueError,) as err:
2575 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2576 return
2577
2578 playlist = config['playlist']
2579 videoUrl = playlist[1]['url']
2580
2581 info = {
2582 'id': videoId,
2583 'url': videoUrl,
2584 'uploader': showName,
2585 'upload_date': None,
2586 'title': showName,
2587 'ext': 'flv',
2588 'thumbnail': imgUrl,
2589 'description': description,
2590 'player_url': playerUrl,
2591 }
2592
2593 return [info]
d77c3dfd 2594
d77c3dfd 2595class CollegeHumorIE(InfoExtractor):
59ae15a5
PH
2596 """Information extractor for collegehumor.com"""
2597
0eb0faa2 2598 _WORKING = False
59ae15a5
PH
2599 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2600 IE_NAME = u'collegehumor'
2601
799c0763 2602 def report_manifest(self, video_id):
59ae15a5 2603 """Report information extraction."""
799c0763 2604 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
59ae15a5
PH
2605
2606 def report_extraction(self, video_id):
2607 """Report information extraction."""
2608 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2609
2610 def _real_extract(self, url):
2611 mobj = re.match(self._VALID_URL, url)
2612 if mobj is None:
2613 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2614 return
2615 video_id = mobj.group('videoid')
2616
59ae15a5
PH
2617 info = {
2618 'id': video_id,
59ae15a5
PH
2619 'uploader': None,
2620 'upload_date': None,
2621 }
2622
2623 self.report_extraction(video_id)
799c0763 2624 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
59ae15a5
PH
2625 try:
2626 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2628 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2629 return
2630
2631 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2632 try:
2633 videoNode = mdoc.findall('./video')[0]
2634 info['description'] = videoNode.findall('./description')[0].text
2635 info['title'] = videoNode.findall('./caption')[0].text
59ae15a5 2636 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
799c0763 2637 manifest_url = videoNode.findall('./file')[0].text
59ae15a5
PH
2638 except IndexError:
2639 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2640 return
2641
799c0763
PH
2642 manifest_url += '?hdcore=2.10.3'
2643 self.report_manifest(video_id)
2644 try:
2645 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2646 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2647 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2648 return
2649
2650 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2651 try:
2652 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2653 node_id = media_node.attrib['url']
2654 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2655 except IndexError as err:
2656 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2657 return
2658
2659 url_pr = compat_urllib_parse_urlparse(manifest_url)
2660 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2661
2662 info['url'] = url
2663 info['ext'] = 'f4f'
59ae15a5 2664 return [info]
d77c3dfd
FV
2665
2666
2667class XVideosIE(InfoExtractor):
59ae15a5 2668 """Information extractor for xvideos.com"""
d77c3dfd 2669
59ae15a5
PH
2670 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2671 IE_NAME = u'xvideos'
d77c3dfd 2672
59ae15a5
PH
2673 def report_extraction(self, video_id):
2674 """Report information extraction."""
2675 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
d77c3dfd 2676
59ae15a5
PH
2677 def _real_extract(self, url):
2678 mobj = re.match(self._VALID_URL, url)
2679 if mobj is None:
2680 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2681 return
8588a86f 2682 video_id = mobj.group(1)
d77c3dfd 2683
5f955171 2684 webpage = self._download_webpage(url, video_id)
d77c3dfd 2685
59ae15a5 2686 self.report_extraction(video_id)
d77c3dfd
FV
2687
2688
59ae15a5
PH
2689 # Extract video URL
2690 mobj = re.search(r'flv_url=(.+?)&', webpage)
2691 if mobj is None:
2692 self._downloader.trouble(u'ERROR: unable to extract video url')
2693 return
8588a86f 2694 video_url = compat_urllib_parse.unquote(mobj.group(1))
d77c3dfd
FV
2695
2696
59ae15a5
PH
2697 # Extract title
2698 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2699 if mobj is None:
2700 self._downloader.trouble(u'ERROR: unable to extract video title')
2701 return
8588a86f 2702 video_title = mobj.group(1)
d77c3dfd
FV
2703
2704
59ae15a5
PH
2705 # Extract video thumbnail
2706 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2707 if mobj is None:
2708 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2709 return
8588a86f 2710 video_thumbnail = mobj.group(0)
d77c3dfd 2711
59ae15a5
PH
2712 info = {
2713 'id': video_id,
2714 'url': video_url,
2715 'uploader': None,
2716 'upload_date': None,
2717 'title': video_title,
2718 'ext': 'flv',
2719 'thumbnail': video_thumbnail,
2720 'description': None,
2721 }
d77c3dfd 2722
59ae15a5 2723 return [info]
d77c3dfd
FV
2724
2725
2726class SoundcloudIE(InfoExtractor):
59ae15a5
PH
2727 """Information extractor for soundcloud.com
2728 To access the media, the uid of the song and a stream token
2729 must be extracted from the page source and the script must make
2730 a request to media.soundcloud.com/crossdomain.xml. Then
2731 the media can be grabbed by requesting from an url composed
2732 of the stream token and uid
2733 """
2734
2735 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2736 IE_NAME = u'soundcloud'
2737
2738 def __init__(self, downloader=None):
2739 InfoExtractor.__init__(self, downloader)
2740
8fd3afd5 2741 def report_resolve(self, video_id):
59ae15a5 2742 """Report information extraction."""
8fd3afd5 2743 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
59ae15a5
PH
2744
2745 def report_extraction(self, video_id):
2746 """Report information extraction."""
8fd3afd5 2747 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
59ae15a5
PH
2748
2749 def _real_extract(self, url):
2750 mobj = re.match(self._VALID_URL, url)
2751 if mobj is None:
2752 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2753 return
2754
2755 # extract uploader (which is in the url)
15c8d833 2756 uploader = mobj.group(1)
59ae15a5 2757 # extract simple title (uploader + slug of song title)
15c8d833 2758 slug_title = mobj.group(2)
59ae15a5
PH
2759 simple_title = uploader + u'-' + slug_title
2760
8fd3afd5 2761 self.report_resolve('%s/%s' % (uploader, slug_title))
59ae15a5 2762
8fd3afd5
PH
2763 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2764 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2765 request = compat_urllib_request.Request(resolv_url)
59ae15a5 2766 try:
8fd3afd5
PH
2767 info_json_bytes = compat_urllib_request.urlopen(request).read()
2768 info_json = info_json_bytes.decode('utf-8')
59ae15a5
PH
2769 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2770 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2771 return
2772
8fd3afd5
PH
2773 info = json.loads(info_json)
2774 video_id = info['id']
59ae15a5
PH
2775 self.report_extraction('%s/%s' % (uploader, slug_title))
2776
8fd3afd5 2777 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
c7214f9a 2778 request = compat_urllib_request.Request(streams_url)
8fd3afd5
PH
2779 try:
2780 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2781 stream_json = stream_json_bytes.decode('utf-8')
2782 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5f955171 2783 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
b4cd069d 2784 return
59ae15a5 2785
8fd3afd5 2786 streams = json.loads(stream_json)
c7214f9a 2787 mediaURL = streams['http_mp3_128_url']
59ae15a5
PH
2788
2789 return [{
c7214f9a 2790 'id': info['id'],
59ae15a5 2791 'url': mediaURL,
c7214f9a
PH
2792 'uploader': info['user']['username'],
2793 'upload_date': info['created_at'],
2794 'title': info['title'],
59ae15a5 2795 'ext': u'mp3',
c7214f9a 2796 'description': info['description'],
59ae15a5 2797 }]
d77c3dfd
FV
2798
2799
2800class InfoQIE(InfoExtractor):
59ae15a5 2801 """Information extractor for infoq.com"""
59ae15a5 2802 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
59ae15a5 2803
59ae15a5
PH
2804 def report_extraction(self, video_id):
2805 """Report information extraction."""
2806 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2807
2808 def _real_extract(self, url):
2809 mobj = re.match(self._VALID_URL, url)
2810 if mobj is None:
2811 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2812 return
2813
4fcca4bb 2814 webpage = self._download_webpage(url, video_id=url)
59ae15a5
PH
2815 self.report_extraction(url)
2816
59ae15a5
PH
2817 # Extract video URL
2818 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2819 if mobj is None:
2820 self._downloader.trouble(u'ERROR: unable to extract video url')
2821 return
4fcca4bb
PH
2822 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2823 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
59ae15a5
PH
2824
2825 # Extract title
2826 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2827 if mobj is None:
2828 self._downloader.trouble(u'ERROR: unable to extract video title')
2829 return
4fcca4bb 2830 video_title = mobj.group(1)
59ae15a5
PH
2831
2832 # Extract description
2833 video_description = u'No description available.'
2834 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2835 if mobj is not None:
4fcca4bb 2836 video_description = mobj.group(1)
59ae15a5
PH
2837
2838 video_filename = video_url.split('/')[-1]
2839 video_id, extension = video_filename.split('.')
2840
2841 info = {
2842 'id': video_id,
2843 'url': video_url,
2844 'uploader': None,
2845 'upload_date': None,
2846 'title': video_title,
2847 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2848 'thumbnail': None,
2849 'description': video_description,
2850 }
2851
2852 return [info]
d77c3dfd
FV
2853
2854class MixcloudIE(InfoExtractor):
59ae15a5 2855 """Information extractor for www.mixcloud.com"""
93702113
FV
2856
2857 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
59ae15a5
PH
2858 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2859 IE_NAME = u'mixcloud'
2860
2861 def __init__(self, downloader=None):
2862 InfoExtractor.__init__(self, downloader)
2863
2864 def report_download_json(self, file_id):
2865 """Report JSON download."""
2866 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2867
2868 def report_extraction(self, file_id):
2869 """Report information extraction."""
2870 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2871
2872 def get_urls(self, jsonData, fmt, bitrate='best'):
2873 """Get urls from 'audio_formats' section in json"""
2874 file_url = None
2875 try:
2876 bitrate_list = jsonData[fmt]
2877 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2878 bitrate = max(bitrate_list) # select highest
2879
2880 url_list = jsonData[fmt][bitrate]
2881 except TypeError: # we have no bitrate info.
2882 url_list = jsonData[fmt]
2883 return url_list
2884
2885 def check_urls(self, url_list):
2886 """Returns 1st active url from list"""
2887 for url in url_list:
2888 try:
2889 compat_urllib_request.urlopen(url)
2890 return url
2891 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2892 url = None
2893
2894 return None
2895
2896 def _print_formats(self, formats):
2897 print('Available formats:')
2898 for fmt in formats.keys():
2899 for b in formats[fmt]:
2900 try:
2901 ext = formats[fmt][b][0]
2902 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2903 except TypeError: # we have no bitrate info
2904 ext = formats[fmt][0]
2905 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2906 break
2907
2908 def _real_extract(self, url):
2909 mobj = re.match(self._VALID_URL, url)
2910 if mobj is None:
2911 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2912 return
2913 # extract uploader & filename from url
2914 uploader = mobj.group(1) # regex groups from a str URL are already text; no decode needed
2915 file_id = uploader + "-" + mobj.group(2)
2916
2917 # construct API request
2918 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2919 # retrieve .json file with links to files
2920 request = compat_urllib_request.Request(file_url)
2921 try:
2922 self.report_download_json(file_url)
2923 jsonData = compat_urllib_request.urlopen(request).read()
2924 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2925 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2926 return
2927
2928 # parse JSON
2929 json_data = json.loads(jsonData)
2930 player_url = json_data['player_swf_url']
2931 formats = dict(json_data['audio_formats'])
2932
2933 req_format = self._downloader.params.get('format', None)
2934 bitrate = None
2935
2936 if self._downloader.params.get('listformats', None):
2937 self._print_formats(formats)
2938 return
2939
2940 if req_format is None or req_format == 'best':
2941 for format_param in formats.keys():
2942 url_list = self.get_urls(formats, format_param)
2943 # check urls
2944 file_url = self.check_urls(url_list)
2945 if file_url is not None:
2946 break # got it!
2947 else:
99b0a129 2948 if req_format not in formats:
59ae15a5
PH
2949 self._downloader.trouble(u'ERROR: format is not available')
2950 return
2951
2952 url_list = self.get_urls(formats, req_format)
2953 file_url = self.check_urls(url_list)
2954 format_param = req_format
2955
2956 return [{
2957 'id': file_id,
2958 'url': file_url,
2959 'uploader': uploader,
2960 'upload_date': None,
2961 'title': json_data['name'],
2962 'ext': file_url.split('.')[-1],
2963 'format': (format_param is None and u'NA' or format_param),
2964 'thumbnail': json_data['thumbnail_url'],
2965 'description': json_data['description'],
2966 'player_url': player_url,
2967 }]
d77c3dfd
FV
2968
2969class StanfordOpenClassroomIE(InfoExtractor):
59ae15a5
PH
2970 """Information extractor for Stanford's Open ClassRoom"""
2971
2972 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2973 IE_NAME = u'stanfordoc'
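 # The URL may point to a single video (course and video groups both match),
 # to a course page (only the course group matches), or to the OpenClassroom
 # root page; _real_extract branches on which named groups are present.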
2974
2975 def report_download_webpage(self, objid):
2976 """Report information extraction."""
2977 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2978
2979 def report_extraction(self, video_id):
2980 """Report information extraction."""
2981 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2982
2983 def _real_extract(self, url):
2984 mobj = re.match(self._VALID_URL, url)
2985 if mobj is None:
2986 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2987 return
2988
2989 if mobj.group('course') and mobj.group('video'): # A specific video
2990 course = mobj.group('course')
2991 video = mobj.group('video')
2992 info = {
2993 'id': course + '_' + video,
2994 'uploader': None,
2995 'upload_date': None,
2996 }
2997
2998 self.report_extraction(info['id'])
2999 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3000 xmlUrl = baseUrl + video + '.xml'
3001 try:
3002 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3003 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3004 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3005 return
3006 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3007 try:
3008 info['title'] = mdoc.findall('./title')[0].text
3009 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3010 except IndexError:
3011 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3012 return
3013 info['ext'] = info['url'].rpartition('.')[2]
3014 return [info]
3015 elif mobj.group('course'): # A course page
3016 course = mobj.group('course')
3017 info = {
3018 'id': course,
3019 'type': 'playlist',
3020 'uploader': None,
3021 'upload_date': None,
3022 }
3023
3024 self.report_download_webpage(info['id'])
3025 try:
3026 coursepage = compat_urllib_request.urlopen(url).read()
3027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3028 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3029 return
3030
3031 m = re.search('<h1>([^<]+)</h1>', coursepage)
3032 if m:
3033 info['title'] = unescapeHTML(m.group(1))
3034 else:
3035 info['title'] = info['id']
3036
3037 m = re.search('<description>([^<]+)</description>', coursepage)
3038 if m:
3039 info['description'] = unescapeHTML(m.group(1))
3040
3041 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3042 info['list'] = [
3043 {
3044 'type': 'reference',
3045 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3046 }
3047 for vpage in links]
3048 results = []
3049 for entry in info['list']:
3050 assert entry['type'] == 'reference'
3051 results += self.extract(entry['url'])
3052 return results
cdb30764 3053
59ae15a5
PH
3054 else: # Root page
3055 info = {
3056 'id': 'Stanford OpenClassroom',
3057 'type': 'playlist',
3058 'uploader': None,
3059 'upload_date': None,
3060 }
3061
3062 self.report_download_webpage(info['id'])
3063 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3064 try:
3065 rootpage = compat_urllib_request.urlopen(rootURL).read()
3066 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3067 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3068 return
3069
3070 info['title'] = info['id']
3071
3072 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3073 info['list'] = [
3074 {
3075 'type': 'reference',
3076 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3077 }
3078 for cpage in links]
3079
3080 results = []
3081 for entry in info['list']:
3082 assert entry['type'] == 'reference'
3083 results += self.extract(entry['url'])
3084 return results
d77c3dfd
FV
3085
3086class MTVIE(InfoExtractor):
59ae15a5
PH
3087 """Information extractor for MTV.com"""
3088
3089 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3090 IE_NAME = u'mtv'
3091
59ae15a5
PH
3092 def report_extraction(self, video_id):
3093 """Report information extraction."""
3094 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3095
3096 def _real_extract(self, url):
3097 mobj = re.match(self._VALID_URL, url)
3098 if mobj is None:
3099 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3100 return
3101 if not mobj.group('proto'):
3102 url = 'http://' + url
3103 video_id = mobj.group('videoid')
59ae15a5 3104
5f955171 3105 webpage = self._download_webpage(url, video_id)
59ae15a5
PH
3106
3107 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3108 if mobj is None:
3109 self._downloader.trouble(u'ERROR: unable to extract song name')
3110 return
3111 song_name = unescapeHTML(mobj.group(1))
3112 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3113 if mobj is None:
3114 self._downloader.trouble(u'ERROR: unable to extract performer')
3115 return
3116 performer = unescapeHTML(mobj.group(1))
cdb30764 3117 video_title = performer + ' - ' + song_name
59ae15a5
PH
3118
3119 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3120 if mobj is None:
3121 self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
3122 return
3123 mtvn_uri = mobj.group(1)
3124
3125 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3126 if mobj is None:
3127 self._downloader.trouble(u'ERROR: unable to extract content id')
3128 return
3129 content_id = mobj.group(1)
3130
3131 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3132 self.report_extraction(video_id)
3133 request = compat_urllib_request.Request(videogen_url)
3134 try:
3135 metadataXml = compat_urllib_request.urlopen(request).read()
3136 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3137 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3138 return
3139
3140 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3141 renditions = mdoc.findall('.//rendition')
3142
3143 # For now, always pick the highest quality.
3144 rendition = renditions[-1]
3145
3146 try:
3147 _,_,ext = rendition.attrib['type'].partition('/')
3148 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3149 video_url = rendition.find('./src').text
3150 except KeyError:
3151 self._downloader.trouble(u'ERROR: Invalid rendition field.')
3152 return
3153
3154 info = {
3155 'id': video_id,
3156 'url': video_url,
3157 'uploader': performer,
3158 'upload_date': None,
3159 'title': video_title,
3160 'ext': ext,
3161 'format': format,
3162 }
3163
3164 return [info]
6de7ef9b 3165
302efc19 3166
302efc19 3167class YoukuIE(InfoExtractor):
59ae15a5 3168 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
59ae15a5
PH
3169
3170 def report_download_webpage(self, file_id):
3171 """Report webpage download."""
a34dd63b 3172 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
59ae15a5
PH
3173
3174 def report_extraction(self, file_id):
3175 """Report information extraction."""
a34dd63b 3176 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
59ae15a5
PH
3177
3178 def _gen_sid(self):
3179 nowTime = int(time.time() * 1000)
3180 random1 = random.randint(1000,1998)
3181 random2 = random.randint(1000,9999)
3182
3183 return "%d%d%d" %(nowTime,random1,random2)
3184
3185 def _get_file_ID_mix_string(self, seed):
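 # Builds a deterministic permutation of the character set below, driven by a
 # simple linear-congruential sequence seeded with the server-supplied seed;
 # the result is the lookup table used by _get_file_id().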
3186 mixed = []
3187 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3188 seed = float(seed)
3189 for i in range(len(source)):
3190 seed = (seed * 211 + 30031 ) % 65536
3191 index = math.floor(seed / 65536 * len(source) )
3192 mixed.append(source[int(index)])
3193 source.remove(source[int(index)])
3194 #return ''.join(mixed)
3195 return mixed
3196
3197 def _get_file_id(self, fileId, seed):
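 # fileId is a '*'-separated list of indices into the mixed character table;
 # mapping each index back through the table reconstructs the real file id.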
3198 mixed = self._get_file_ID_mix_string(seed)
3199 ids = fileId.split('*')
3200 realId = []
3201 for ch in ids:
3202 if ch:
3203 realId.append(mixed[int(ch)])
3204 return ''.join(realId)
3205
3206 def _real_extract(self, url):
3207 mobj = re.match(self._VALID_URL, url)
3208 if mobj is None:
3209 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3210 return
3211 video_id = mobj.group('ID')
3212
3213 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3214
3215 request = compat_urllib_request.Request(info_url, None, std_headers)
3216 try:
3217 self.report_download_webpage(video_id)
3218 jsondata = compat_urllib_request.urlopen(request).read()
3219 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3220 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3221 return
3222
3223 self.report_extraction(video_id)
3224 try:
8f6f40d9
PH
3225 jsonstr = jsondata.decode('utf-8')
3226 config = json.loads(jsonstr)
59ae15a5
PH
3227
3228 video_title = config['data'][0]['title']
3229 seed = config['data'][0]['seed']
3230
3231 format = self._downloader.params.get('format', None)
1a2c3c0f 3232 supported_format = list(config['data'][0]['streamfileids'].keys())
59ae15a5
PH
3233
3234 if format is None or format == 'best':
3235 if 'hd2' in supported_format:
3236 format = 'hd2'
3237 else:
3238 format = 'flv'
3239 ext = u'flv'
3240 elif format == 'worst':
3241 format = 'mp4'
3242 ext = u'mp4'
3243 else:
3244 format = 'flv'
3245 ext = u'flv'
3246
3247
3248 fileid = config['data'][0]['streamfileids'][format]
e2a8ff24 3249 keys = [s['k'] for s in config['data'][0]['segs'][format]]
8f6f40d9 3250 except (UnicodeDecodeError, ValueError, KeyError):
59ae15a5
PH
3251 self._downloader.trouble(u'ERROR: unable to extract info section')
3252 return
3253
3254 files_info=[]
3255 sid = self._gen_sid()
3256 fileid = self._get_file_id(fileid, seed)
3257
3258 # Characters at indices 8 and 9 of fileid encode the segment number,
3259 # so fileid[8:10] is replaced with a two-digit hex index below.
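 # Illustrative example (hypothetical fileid): with fileid = 'ABCDEFGHXX-rest'
 # and index == 3, temp_fileid becomes 'ABCDEFGH' + '03' + '-rest'.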
3260 for index, key in enumerate(keys):
3261
3262 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3263 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3264
3265 info = {
3266 'id': '%s_part%02d' % (video_id, index),
3267 'url': download_url,
3268 'uploader': None,
3269 'upload_date': None,
3270 'title': video_title,
3271 'ext': ext,
3272 }
3273 files_info.append(info)
3274
3275 return files_info
5dc846fa
FV
3276
3277
6de7ef9b 3278class XNXXIE(InfoExtractor):
59ae15a5
PH
3279 """Information extractor for xnxx.com"""
3280
caec7618 3281 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
59ae15a5
PH
3282 IE_NAME = u'xnxx'
3283 VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3284 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3285 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3286
3287 def report_webpage(self, video_id):
3288 """Report information extraction"""
3289 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3290
3291 def report_extraction(self, video_id):
3292 """Report information extraction"""
3293 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3294
3295 def _real_extract(self, url):
3296 mobj = re.match(self._VALID_URL, url)
3297 if mobj is None:
3298 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3299 return
bec102a8 3300 video_id = mobj.group(1)
59ae15a5
PH
3301
3302 self.report_webpage(video_id)
3303
3304 # Get webpage content
3305 try:
bec102a8
PH
3306 webpage_bytes = compat_urllib_request.urlopen(url).read()
3307 webpage = webpage_bytes.decode('utf-8')
59ae15a5
PH
3308 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3309 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3310 return
3311
3312 result = re.search(self.VIDEO_URL_RE, webpage)
3313 if result is None:
3314 self._downloader.trouble(u'ERROR: unable to extract video url')
3315 return
bec102a8 3316 video_url = compat_urllib_parse.unquote(result.group(1))
59ae15a5
PH
3317
3318 result = re.search(self.VIDEO_TITLE_RE, webpage)
3319 if result is None:
3320 self._downloader.trouble(u'ERROR: unable to extract video title')
3321 return
bec102a8 3322 video_title = result.group(1)
59ae15a5
PH
3323
3324 result = re.search(self.VIDEO_THUMB_RE, webpage)
3325 if result is None:
3326 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3327 return
bec102a8 3328 video_thumbnail = result.group(1)
59ae15a5
PH
3329
3330 return [{
3331 'id': video_id,
3332 'url': video_url,
3333 'uploader': None,
3334 'upload_date': None,
3335 'title': video_title,
3336 'ext': 'flv',
3337 'thumbnail': video_thumbnail,
3338 'description': None,
3339 }]
fd873c69
FV
3340
3341
d443aca8 3342class GooglePlusIE(InfoExtractor):
59ae15a5
PH
3343 """Information extractor for plus.google.com."""
3344
93702113 3345 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
59ae15a5
PH
3346 IE_NAME = u'plus.google'
3347
3348 def __init__(self, downloader=None):
3349 InfoExtractor.__init__(self, downloader)
3350
3351 def report_extract_entry(self, url):
3352 """Report downloading extry"""
93702113 3353 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
59ae15a5
PH
3354
3355 def report_date(self, upload_date):
3356 """Report downloading extry"""
3357 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3358
3359 def report_uploader(self, uploader):
3360 """Report downloading extry"""
93702113 3361 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
59ae15a5
PH
3362
3363 def report_title(self, video_title):
3364 """Report downloading extry"""
93702113 3365 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
59ae15a5
PH
3366
3367 def report_extract_vid_page(self, video_page):
3368 """Report information extraction."""
93702113 3369 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
59ae15a5
PH
3370
3371 def _real_extract(self, url):
3372 # Extract id from URL
3373 mobj = re.match(self._VALID_URL, url)
3374 if mobj is None:
3375 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3376 return
3377
3378 post_url = mobj.group(0)
93702113 3379 video_id = mobj.group(1)
59ae15a5
PH
3380
3381 video_extension = 'flv'
3382
3383 # Step 1, Retrieve post webpage to extract further information
3384 self.report_extract_entry(post_url)
3385 request = compat_urllib_request.Request(post_url)
3386 try:
93702113 3387 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
3388 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3389 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3390 return
3391
3392 # Extract update date
3393 upload_date = None
3394 pattern = 'title="Timestamp">(.*?)</a>'
3395 mobj = re.search(pattern, webpage)
3396 if mobj:
3397 upload_date = mobj.group(1)
3398 # Convert timestring to a format suitable for filename
3399 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3400 upload_date = upload_date.strftime('%Y%m%d')
3401 self.report_date(upload_date)
3402
3403 # Extract uploader
3404 uploader = None
3405 pattern = r'rel\="author".*?>(.*?)</a>'
3406 mobj = re.search(pattern, webpage)
3407 if mobj:
3408 uploader = mobj.group(1)
3409 self.report_uploader(uploader)
3410
3411 # Extract title
3412 # Get the first line for title
3413 video_title = u'NA'
3414 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3415 mobj = re.search(pattern, webpage)
3416 if mobj:
3417 video_title = mobj.group(1)
3418 self.report_title(video_title)
3419
3420 # Step 2, Simulate clicking the image box to launch video
3421 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3422 mobj = re.search(pattern, webpage)
3423 if mobj is None:
3424 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3425 return
3426 video_page = mobj.group(1)
3427 request = compat_urllib_request.Request(video_page)
3428 try:
93702113 3429 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
59ae15a5
PH
3430 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3431 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3432 return
3433 self.report_extract_vid_page(video_page)
3434
3435
3436 # Extract video links on video page
3437 # (collect the links for all available sizes)
3438 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3439 mobj = re.findall(pattern, webpage)
3440 if len(mobj) == 0:
3441 self._downloader.trouble(u'ERROR: unable to extract video links')
3442 return
3443 # Sort in resolution
3444 links = sorted(mobj)
3445
3446 # Choose the last of the sorted list, i.e. the highest resolution
3447 video_url = links[-1]
3448 # Only get the url. The resolution part in the tuple has no use anymore
3449 video_url = video_url[-1]
3450 # Treat escaped \u0026 style hex
93702113
FV
3451 try:
3452 video_url = video_url.decode("unicode_escape")
3453 except AttributeError: # Python 3
3454 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
59ae15a5
PH
3455
3456
3457 return [{
93702113 3458 'id': video_id,
59ae15a5 3459 'url': video_url,
93702113
FV
3460 'uploader': uploader,
3461 'upload_date': upload_date,
3462 'title': video_title,
3463 'ext': video_extension,
59ae15a5 3464 }]
4cc3d074
PH
3465
3466class NBAIE(InfoExtractor):
3467 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3468 IE_NAME = u'nba'
3469
4cc3d074
PH
3470 def _real_extract(self, url):
3471 mobj = re.match(self._VALID_URL, url)
3472 if mobj is None:
3473 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3474 return
3475
3476 video_id = mobj.group(1)
3477 if video_id.endswith('/index.html'):
3478 video_id = video_id[:-len('/index.html')]
3479
5f955171 3480 webpage = self._download_webpage(url, video_id)
4cc3d074
PH
3481
3482 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
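 # The video URL is built directly from the page path; this assumes a
 # 1280x720 mp4 rendition exists on the CDN for every video id.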
3483 def _findProp(rexp, default=None):
3484 m = re.search(rexp, webpage)
3485 if m:
3486 return unescapeHTML(m.group(1))
3487 else:
3488 return default
3489
3490 shortened_video_id = video_id.rpartition('/')[2]
3491 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3492 info = {
3493 'id': shortened_video_id,
3494 'url': video_url,
3495 'ext': 'mp4',
3496 'title': title,
3497 'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3498 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3499 }
3500 return [info]
0b40544f
DV
3501
3502class JustinTVIE(InfoExtractor):
3503 """Information extractor for justin.tv and twitch.tv"""
2ab1c5ed
DV
3504 # TODO: One broadcast may be split into multiple videos. The key
3505 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3506 # starts at 1 and increases. Can we treat all parts as one video?
3507
4096b609
DV
3508 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3509 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3510 _JUSTIN_PAGE_LIMIT = 100
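 # For channel archive URLs the API is paged: _real_extract requests batches of
 # _JUSTIN_PAGE_LIMIT clips via offset/limit query parameters until a short
 # page is returned.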
0b40544f
DV
3511 IE_NAME = u'justin.tv'
3512
3513 def report_extraction(self, file_id):
3514 """Report information extraction."""
3515 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3516
4096b609
DV
3517 def report_download_page(self, channel, offset):
3518 """Report attempt to download a single page of videos."""
3519 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3520 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3521
2ab1c5ed
DV
3522 # Return count of items, list of *valid* items
3523 def _parse_page(self, url):
0b40544f 3524 try:
2ab1c5ed 3525 urlh = compat_urllib_request.urlopen(url)
0b40544f
DV
3526 webpage_bytes = urlh.read()
3527 webpage = webpage_bytes.decode('utf-8', 'ignore')
3528 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3529 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3530 return
cdb30764 3531
0b40544f 3532 response = json.loads(webpage)
fa1bf9c6 3533 if type(response) != list:
3534 error_text = response.get('error', 'unknown error')
3535 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3536 return
0b40544f
DV
3537 info = []
3538 for clip in response:
3539 video_url = clip['video_file_url']
3540 if video_url:
3541 video_extension = os.path.splitext(video_url)[1][1:]
fa1bf9c6 3542 video_date = re.sub('-', '', clip['start_time'][:10])
3543 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
97f194c1
PH
3544 video_id = clip['id']
3545 video_title = clip.get('title', video_id)
0b40544f 3546 info.append({
97f194c1 3547 'id': video_id,
0b40544f 3548 'url': video_url,
97f194c1 3549 'title': video_title,
fa1bf9c6 3550 'uploader': clip.get('channel_name', video_uploader_id),
3551 'uploader_id': video_uploader_id,
0b40544f
DV
3552 'upload_date': video_date,
3553 'ext': video_extension,
3554 })
2ab1c5ed
DV
3555 return (len(response), info)
3556
3557 def _real_extract(self, url):
3558 mobj = re.match(self._VALID_URL, url)
3559 if mobj is None:
3560 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3561 return
cdb30764 3562
2ab1c5ed
DV
3563 api = 'http://api.justin.tv'
3564 video_id = mobj.group(mobj.lastindex)
3565 paged = False
3566 if mobj.lastindex == 1:
3567 paged = True
3568 api += '/channel/archives/%s.json'
3569 else:
fa1bf9c6 3570 api += '/broadcast/by_archive/%s.json'
2ab1c5ed 3571 api = api % (video_id,)
cdb30764 3572
2ab1c5ed 3573 self.report_extraction(video_id)
cdb30764 3574
2ab1c5ed
DV
3575 info = []
3576 offset = 0
4096b609
DV
3577 limit = self._JUSTIN_PAGE_LIMIT
3578 while True:
3579 if paged:
3580 self.report_download_page(video_id, offset)
2ab1c5ed
DV
3581 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3582 page_count, page_info = self._parse_page(page_url)
3583 info.extend(page_info)
3584 if not paged or page_count != limit:
3585 break
3586 offset += limit
0b40544f 3587 return info
21a9c6aa
PH
3588
3589class FunnyOrDieIE(InfoExtractor):
3590 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
21a9c6aa 3591
21a9c6aa
PH
3592 def _real_extract(self, url):
3593 mobj = re.match(self._VALID_URL, url)
3594 if mobj is None:
3595 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3596 return
3597
3598 video_id = mobj.group('id')
5f955171 3599 webpage = self._download_webpage(url, video_id)
21a9c6aa
PH
3600
3601 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3602 if not m:
3603 raise ExtractorError(u'Unable to find video information')
3604 video_url = unescapeHTML(m.group('url'))
21a9c6aa
PH
3605
3606 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3607 if not m:
3608 raise ExtractorError(u'Cannot find video title')
3609 title = unescapeHTML(m.group('title'))
3610
3611 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3612 if m:
3613 desc = unescapeHTML(m.group('desc'))
3614 else:
3615 desc = None
3616
3617 info = {
3618 'id': video_id,
3619 'url': video_url,
3620 'ext': 'mp4',
3621 'title': title,
3622 'description': desc,
3623 }
3624 return [info]
d0d4f277
PH
3625
3626class TweetReelIE(InfoExtractor):
3627 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3628
d0d4f277
PH
3629 def _real_extract(self, url):
3630 mobj = re.match(self._VALID_URL, url)
3631 if mobj is None:
3632 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3633 return
3634
3635 video_id = mobj.group('id')
5f955171 3636 webpage = self._download_webpage(url, video_id)
d0d4f277
PH
3637
3638 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3639 if not m:
3640 raise ExtractorError(u'Cannot find status ID')
3641 status_id = m.group(1)
3642
3643 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3644 if not m:
3645 self._downloader.trouble(u'WARNING: Cannot find description')
3646 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3647
3648 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3649 if not m:
3650 raise ExtractorError(u'Cannot find uploader')
3651 uploader = unescapeHTML(m.group('uploader'))
3652 uploader_id = unescapeHTML(m.group('uploader_id'))
3653
3654 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3655 if not m:
3656 raise ExtractorError(u'Cannot find upload date')
3657 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3658
3659 title = desc
3660 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3661
3662 info = {
3663 'id': video_id,
3664 'url': video_url,
3665 'ext': 'mov',
3666 'title': title,
3667 'description': desc,
3668 'uploader': uploader,
3669 'uploader_id': uploader_id,
3670 'internal_id': status_id,
3671 'upload_date': upload_date
3672 }
3673 return [info]
e314ba67
JMF
3674
3675class SteamIE(InfoExtractor):
3676 _VALID_URL = r"""http://store.steampowered.com/
3677 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3678 (?P<gameID>\d+)/?
3679 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3680 """
4aeae91f 3681
e314ba67
JMF
3682 def suitable(self, url):
3683 """Receives a URL and returns True if suitable for this IE."""
3684 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
5f955171 3685
e314ba67
JMF
3686 def _real_extract(self, url):
3687 m = re.match(self._VALID_URL, url, re.VERBOSE)
3688 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3689 gameID = m.group('gameID')
3690 videourl = 'http://store.steampowered.com/video/%s/' % gameID
5f955171 3691 webpage = self._download_webpage(videourl, gameID)
e314ba67 3692 mweb = re.finditer(urlRE, webpage)
5e9d042d
JMF
3693 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3694 titles = re.finditer(namesRE, webpage)
e314ba67 3695 videos = []
5f955171 3696 for vid,vtitle in zip(mweb,titles):
e314ba67 3697 video_id = vid.group('videoID')
5f955171
PH
3698 title = vtitle.group('videoName')
3699 video_url = vid.group('videoURL')
e314ba67
JMF
3700 if not video_url:
3701 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
e314ba67
JMF
3702 info = {
3703 'id':video_id,
3704 'url':video_url,
3705 'ext': 'flv',
5e9d042d 3706 'title': unescapeHTML(title)
e314ba67
JMF
3707 }
3708 videos.append(info)
3709 return videos
ef0c8d5f 3710
278986ea 3711class UstreamIE(InfoExtractor):
ef0c8d5f 3712 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
278986ea 3713 IE_NAME = u'ustream'
ef0c8d5f 3714
278986ea
JMF
3715 def _real_extract(self, url):
3716 m = re.match(self._VALID_URL, url)
3717 video_id = m.group('videoID')
3718 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
d830b7c2 3719 webpage = self._download_webpage(url, video_id)
278986ea
JMF
3720 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3721 title = m.group('title')
3722 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3723 uploader = m.group('uploader')
3724 info = {
3725 'id':video_id,
3726 'url':video_url,
3727 'ext': 'flv',
3728 'title': title,
3729 'uploader': uploader
3730 }
3731 return [info]
4aeae91f 3732
ca0a0bbe
PH
3733class RBMARadioIE(InfoExtractor):
3734 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3735
3736 def _real_extract(self, url):
3737 m = re.match(self._VALID_URL, url)
3738 video_id = m.group('videoID')
3739
3740 webpage = self._download_webpage(url, video_id)
3741 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3742 if not m:
3743 raise ExtractorError(u'Cannot find metadata')
3744 json_data = m.group(1)
3745
3746 try:
3747 data = json.loads(json_data)
3748 except ValueError as e:
3749 raise ExtractorError(u'Invalid JSON: ' + str(e))
3750
3751 video_url = data['akamai_url'] + '&cbr=256'
3752 url_parts = compat_urllib_parse_urlparse(video_url)
3753 video_ext = url_parts.path.rpartition('.')[2]
3754 info = {
3755 'id': video_id,
3756 'url': video_url,
3757 'ext': video_ext,
3758 'title': data['title'],
3759 'description': data.get('teaser_text'),
3760 'location': data.get('country_of_origin'),
3761 'uploader': data.get('host', {}).get('name'),
3762 'uploader_id': data.get('host', {}).get('slug'),
187f491a 3763 'thumbnail': data.get('image', {}).get('large_url_2x'),
ca0a0bbe
PH
3764 'duration': data.get('duration'),
3765 }
3766 return [info]
4aeae91f 3767
991ba7fa
JC
3768
3769class YouPornIE(InfoExtractor):
3770 """Information extractor for youporn.com."""
991ba7fa 3771 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
ca6710ee 3772
991ba7fa
JC
3773 def _print_formats(self, formats):
3774 """Print all available formats"""
565f7519 3775 print(u'Available formats:')
ca6710ee
JC
3776 print(u'ext\t\tformat')
3777 print(u'---------------------------------')
991ba7fa 3778 for format in formats:
ca6710ee 3779 print(u'%s\t\t%s' % (format['ext'], format['format']))
991ba7fa
JC
3780
3781 def _specific(self, req_format, formats):
3782 for x in formats:
3783 if(x["format"]==req_format):
3784 return x
3785 return None
3786
991ba7fa
JC
3787 def _real_extract(self, url):
3788 mobj = re.match(self._VALID_URL, url)
3789 if mobj is None:
3790 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3791 return
3792
ca6710ee 3793 video_id = mobj.group('videoid')
991ba7fa 3794
629fcdd1
PH
3795 req = compat_urllib_request.Request(url)
3796 req.add_header('Cookie', 'age_verified=1')
3797 webpage = self._download_webpage(req, video_id)
991ba7fa
JC
3798
3799 # Get the video title
629fcdd1 3800 result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
991ba7fa 3801 if result is None:
629fcdd1 3802 raise ExtractorError(u'ERROR: unable to extract video title')
ca6710ee 3803 video_title = result.group('title').strip()
991ba7fa
JC
3804
3805 # Get the video date
629fcdd1 3806 result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
991ba7fa 3807 if result is None:
629fcdd1
PH
3808 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3809 upload_date = None
3810 else:
3811 upload_date = result.group('date').strip()
991ba7fa
JC
3812
3813 # Get the video uploader
629fcdd1 3814 result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
991ba7fa 3815 if result is None:
629fcdd1
PH
3816 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3817 video_uploader = None
3818 else:
3819 video_uploader = result.group('uploader').strip()
3820 video_uploader = clean_html( video_uploader )
991ba7fa
JC
3821
3822 # Get all of the formats available
ca6710ee
JC
3823 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3824 result = re.search(DOWNLOAD_LIST_RE, webpage)
991ba7fa 3825 if result is None:
629fcdd1 3826 raise ExtractorError(u'Unable to extract download list')
ca6710ee 3827 download_list_html = result.group('download_list').strip()
991ba7fa
JC
3828
3829 # Get all of the links from the page
ca6710ee
JC
3830 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3831 links = re.findall(LINK_RE, download_list_html)
991ba7fa 3832 if(len(links) == 0):
629fcdd1 3833 raise ExtractorError(u'ERROR: no known formats available for video')
991ba7fa
JC
3834
3835 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3836
3837 formats = []
3838 for link in links:
3839
3840 # A link looks like this:
3841 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3842 # A path looks like this:
3843 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
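 # For the example path above, path.split('/')[4] is '480p_370k_8004515',
 # so size becomes '480p', bitrate '370k' and format '480p-370k'.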
ca6710ee
JC
3844 video_url = unescapeHTML( link )
3845 path = compat_urllib_parse_urlparse( video_url ).path
991ba7fa
JC
3846 extension = os.path.splitext( path )[1][1:]
3847 format = path.split('/')[4].split('_')[:2]
3848 size = format[0]
3849 bitrate = format[1]
3850 format = "-".join( format )
3851 title = u'%s-%s-%s' % (video_title, size, bitrate)
3852
3853 formats.append({
3854 'id': video_id,
3855 'url': video_url,
3856 'uploader': video_uploader,
3857 'upload_date': upload_date,
3858 'title': title,
3859 'ext': extension,
3860 'format': format,
3861 'thumbnail': None,
3862 'description': None,
3863 'player_url': None
3864 })
3865
3866 if self._downloader.params.get('listformats', None):
3867 self._print_formats(formats)
3868 return
3869
3870 req_format = self._downloader.params.get('format', None)
991ba7fa
JC
3871 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3872
991ba7fa
JC
3873 if req_format is None or req_format == 'best':
3874 return [formats[0]]
3875 elif req_format == 'worst':
3876 return [formats[-1]]
3877 elif req_format in ('-1', 'all'):
3878 return formats
3879 else:
3880 format = self._specific( req_format, formats )
3881 if format is None:
3882 self._downloader.trouble(u'ERROR: requested format not available')
3883 return
3884 return [format]
3885
3886
3887
3888class PornotubeIE(InfoExtractor):
3889 """Information extractor for pornotube.com."""
991ba7fa 3890 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
991ba7fa 3891
991ba7fa
JC
3892 def _real_extract(self, url):
3893 mobj = re.match(self._VALID_URL, url)
3894 if mobj is None:
3895 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3896 return
3897
ca6710ee
JC
3898 video_id = mobj.group('videoid')
3899 video_title = mobj.group('title')
991ba7fa
JC
3900
3901 # Get webpage content
ca6710ee 3902 webpage = self._download_webpage(url, video_id)
991ba7fa
JC
3903
3904 # Get the video URL
ca6710ee
JC
3905 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3906 result = re.search(VIDEO_URL_RE, webpage)
991ba7fa
JC
3907 if result is None:
3908 self._downloader.trouble(u'ERROR: unable to extract video url')
3909 return
ca6710ee 3910 video_url = compat_urllib_parse.unquote(result.group('url'))
991ba7fa
JC
3911
3912 #Get the uploaded date
ca6710ee
JC
3913 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3914 result = re.search(VIDEO_UPLOADED_RE, webpage)
991ba7fa
JC
3915 if result is None:
3916 self._downloader.trouble(u'ERROR: unable to extract video upload date')
3917 return
ca6710ee 3918 upload_date = result.group('date')
991ba7fa
JC
3919
3920 info = {'id': video_id,
3921 'url': video_url,
3922 'uploader': None,
3923 'upload_date': upload_date,
3924 'title': video_title,
3925 'ext': 'flv',
565f7519 3926 'format': 'flv'}
991ba7fa
JC
3927
3928 return [info]
3929
3930
3931
3932class YouJizzIE(InfoExtractor):
3933 """Information extractor for youjizz.com."""
ca6710ee 3934 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
991ba7fa 3935
991ba7fa 3936 def _real_extract(self, url):
ca6710ee
JC
3937 mobj = re.match(self._VALID_URL, url)
3938 if mobj is None:
3939 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
991ba7fa 3940 return
ca6710ee
JC
3941
3942 video_id = mobj.group('videoid')
3943
3944 # Get webpage content
3945 webpage = self._download_webpage(url, video_id)
991ba7fa
JC
3946
3947 # Get the video title
db16276b 3948 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
991ba7fa 3949 if result is None:
db16276b 3950 raise ExtractorError(u'ERROR: unable to extract video title')
ca6710ee 3951 video_title = result.group('title').strip()
991ba7fa
JC
3952
3953 # Get the embed page
db16276b 3954 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
991ba7fa 3955 if result is None:
db16276b 3956 raise ExtractorError(u'ERROR: unable to extract embed page')
991ba7fa 3957
ca6710ee
JC
3958 embed_page_url = result.group(0).strip()
3959 video_id = result.group('videoid')
991ba7fa 3960
ca6710ee
JC
3961 webpage = self._download_webpage(embed_page_url, video_id)
3962
991ba7fa 3963 # Get the video URL
db16276b 3964 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
991ba7fa 3965 if result is None:
db16276b 3966 raise ExtractorError(u'ERROR: unable to extract video url')
ca6710ee 3967 video_url = result.group('source')
991ba7fa
JC
3968
3969 info = {'id': video_id,
3970 'url': video_url,
991ba7fa
JC
3971 'title': video_title,
3972 'ext': 'flv',
3973 'format': 'flv',
991ba7fa
JC
3974 'player_url': embed_page_url}
3975
3976 return [info]
3977
3978
4aeae91f
PH
3979def gen_extractors():
3980 """ Return a list of an instance of every supported extractor.
3981 The order does matter; the first extractor matched is the one handling the URL.
3982 """
3983 return [
3984 YoutubePlaylistIE(),
3985 YoutubeChannelIE(),
3986 YoutubeUserIE(),
3987 YoutubeSearchIE(),
3988 YoutubeIE(),
3989 MetacafeIE(),
3990 DailymotionIE(),
3991 GoogleSearchIE(),
3992 PhotobucketIE(),
3993 YahooIE(),
3994 YahooSearchIE(),
3995 DepositFilesIE(),
3996 FacebookIE(),
3997 BlipTVUserIE(),
3998 BlipTVIE(),
3999 VimeoIE(),
4000 MyVideoIE(),
4001 ComedyCentralIE(),
4002 EscapistIE(),
4003 CollegeHumorIE(),
4004 XVideosIE(),
4005 SoundcloudIE(),
4006 InfoQIE(),
4007 MixcloudIE(),
4008 StanfordOpenClassroomIE(),
4009 MTVIE(),
4010 YoukuIE(),
4011 XNXXIE(),
18be482a
JC
4012 YouJizzIE(),
4013 PornotubeIE(),
4014 YouPornIE(),
4aeae91f
PH
4015 GooglePlusIE(),
4016 ArteTvIE(),
4017 NBAIE(),
4018 JustinTVIE(),
4019 FunnyOrDieIE(),
4020 TweetReelIE(),
4021 SteamIE(),
4022 UstreamIE(),
ca0a0bbe 4023 RBMARadioIE(),
4aeae91f
PH
4024 GenericIE()
4025 ]
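# A minimal sketch (illustrative, not part of this module) of how the returned
# list is typically used: iterate in order and take the first extractor whose
# suitable() matches the URL.
#
#     def find_extractor(url):
#         for ie in gen_extractors():
#             if ie.suitable(url):
#                 return ie
#         return None
#
# The helper name above is hypothetical; the actual selection is performed by
# the FileDownloader over its registered extractors.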
4026
4027