2 # -*- coding: utf-8 -*-
4 from __future__
import absolute_import
13 import xml
.etree
.ElementTree
20 class InfoExtractor(object):
21 """Information Extractor class.
23 Information extractors are the classes that, given a URL, extract
24 information about the video (or videos) the URL refers to. This
25 information includes the real video URL, the video title, author and
26 others. The information is stored in a dictionary which is then
27 passed to the FileDownloader. The FileDownloader processes this
28 information possibly downloading the video to the file system, among
29 other possible outcomes.
31 The dictionaries must include the following fields:
35 title: Video title, unescaped.
36 ext: Video filename extension.
37 uploader: Full name of the video uploader.
38 upload_date: Video upload date (YYYYMMDD).
40 The following fields are optional:
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnail: Full URL to a video thumbnail image.
44 description: One-line video description.
45 uploader_id: Nickname or id of the video uploader.
46 player_url: SWF Player URL (used for rtmpdump).
47 subtitles: The .srt file contents.
48 urlhandle: [internal] The urlHandle to be used to download the file,
49 like returned by urllib.request.urlopen
51 The fields should all be Unicode strings.
53 Subclasses of this one should re-define the _real_initialize() and
54 _real_extract() methods and define a _VALID_URL regexp.
55 Probably, they should also be added to the list of extractors.
57 _real_extract() must return a *list* of information dictionaries as
60 Finally, the _WORKING attribute should be set to False for broken IEs
61 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): this copy appears to be missing a line between the
    # docstring and this call (gap in the original numbering) — verify
    # against upstream before relying on instance state set here.
    self.set_downloader(downloader)
def suitable(self, url):
    """Receives a URL and returns True if suitable for this IE."""
    # A match anchored at the start of the URL against the subclass's
    # _VALID_URL pattern decides suitability.
    match = re.match(self._VALID_URL, url)
    return match is not None
78 """Getter method for _WORKING."""
82 """Initializes an instance (authentication, etc)."""
84 self
._real
_initialize
()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # Delegates to the subclass-provided hook.
    # NOTE(review): no initialize() call is visible before extraction in
    # this copy — confirm callers run authentication/setup first.
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # _downloader is read by the report_*/trouble helpers; it may be None
    # (the __init__ default) until a real downloader is attached.
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # Intentionally empty in the base class: the docstring is the body.
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # Intentionally empty in the base class: subclasses must override.
106 return type(self
).__name
__[:-2]
108 class YoutubeIE(InfoExtractor
):
109 """Information extractor for youtube.com."""
113 (?:https?://)? # http(s):// (optional)
114 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
115 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
116 (?:.*?\#/)? # handle anchor (#/) redirect urls
117 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
118 (?: # the various things that can precede the ID:
119 (?:(?:v|embed|e)/) # v/ or embed/ or e/
120 |(?: # or the v= param in all its forms
121 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
122 (?:\?|\#!?) # the params delimiter ? or # or #!
123 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
126 )? # optional -> youtube.com/xxxx is OK
127 )? # all until now is optional -> you can pass the naked ID
128 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
129 (?(1).+)? # if we found the ID, everything can follow
# URL used to force the site language to English (results parsing relies on it).
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
# Login form endpoint used during _real_initialize authentication.
_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
# Age-verification confirmation endpoint.
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Extracts the target of a next_url redirect parameter.
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# Machine name looked up in the user's .netrc for credentials.
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
139 _video_extensions
= {
145 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
151 _video_dimensions
= {
def suitable(self, url):
    """Receives a URL and returns True if suitable for this IE."""
    # _VALID_URL is written with embedded whitespace and comments, so the
    # VERBOSE flag is required when matching it.
    return bool(re.match(self._VALID_URL, url, re.VERBOSE))
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""
    # Docstring fixed: it was a copy-paste of the "info webpage" variant.
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video."""
    # Docstring fixed: it previously said "Report extracted video URL."
    # NOTE: the parameter name `format` shadows the builtin, but renaming it
    # would break keyword callers, so it is kept.
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
def _closed_captions_xml_to_srt(self, xml_string):
    """Convert YouTube's timedtext XML into SubRip (.srt) text.

    Each <text start="..." dur="..."> element becomes one numbered SRT cue.
    Fixes vs. the fragment in this copy: the `srt` accumulator is
    initialized before use, `start` is converted to float before the time
    arithmetic, and the accumulated string is returned.
    """
    srt = ''
    texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
    # TODO parse xml instead of regex
    for n, (start, dur_tag, dur, caption) in enumerate(texts):
        if not dur:
            dur = '4'  # default cue length when no dur attribute is present
        start = float(start)
        end = start + float(dur)
        # SRT timestamps: HH:MM:SS,mmm
        start = "%02i:%02i:%02i,%03i" % (start/(60*60), start/60%60, start%60, start%1*1000)
        end = "%02i:%02i:%02i,%03i" % (end/(60*60), end/60%60, end%60, end%1*1000)
        caption = unescapeHTML(caption)
        caption = unescapeHTML(caption)  # double cycle, intentional
        srt += str(n+1) + '\n'
        srt += start + ' --> ' + end + '\n'
        srt += caption + '\n\n'
    return srt
def _extract_subtitles(self, video_id):
    """Fetch closed captions for video_id.

    Returns a 2-tuple (error, srt): on success (None, srt_text); on any
    failure (u'WARNING: ...' message, None). Reconstructed: the `try:`
    headers and the 'en'/else language-selection branches were missing
    from this copy of the source.
    """
    self.report_video_subtitles_download(video_id)
    request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    try:
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
    srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
    # Map lang_code -> track name.
    srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
    if not srt_lang_list:
        return (u'WARNING: video has no closed captions', None)
    # Language preference: explicit option, then English, then first listed.
    if self._downloader.params.get('subtitleslang', False):
        srt_lang = self._downloader.params.get('subtitleslang')
    elif 'en' in srt_lang_list:
        srt_lang = 'en'
    else:
        srt_lang = list(srt_lang_list.keys())[0]
    if not srt_lang in srt_lang_list:
        return (u'WARNING: no closed captions found in the specified language', None)
    request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
    try:
        srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
    if not srt_xml:
        return (u'WARNING: unable to download video subtitles', None)
    return (None, self._closed_captions_xml_to_srt(srt_xml))
254 def _print_formats(self
, formats
):
255 print('Available formats:')
257 print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???')))
259 def _real_initialize(self
):
260 if self
._downloader
is None:
265 downloader_params
= self
._downloader
.params
267 # Attempt to use provided username and password or .netrc data
268 if downloader_params
.get('username', None) is not None:
269 username
= downloader_params
['username']
270 password
= downloader_params
['password']
271 elif downloader_params
.get('usenetrc', False):
273 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
278 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
279 except (IOError, netrc
.NetrcParseError
) as err
:
280 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % compat_str(err
))
284 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
287 compat_urllib_request
.urlopen(request
).read()
288 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
289 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % compat_str(err
))
292 # No authentication to be performed
298 'current_form': 'loginForm',
300 'action_login': 'Log In',
301 'username': username
,
302 'password': password
,
304 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, compat_urllib_parse
.urlencode(login_form
))
307 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
308 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
309 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
311 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
312 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % compat_str(err
))
318 'action_confirm': 'Confirm',
320 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
322 self
.report_age_confirmation()
323 age_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
324 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
325 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % compat_str(err
))
328 def _extract_id(self
, url
):
329 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
331 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
333 video_id
= mobj
.group(2)
336 def _real_extract(self
, url
):
337 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
338 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
340 url
= 'http://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
341 video_id
= self
._extract
_id
(url
)
344 self
.report_video_webpage_download(video_id
)
345 url
= 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
346 request
= compat_urllib_request
.Request(url
)
348 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
349 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
350 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % compat_str(err
))
353 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
355 # Attempt to extract SWF player URL
356 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
358 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
363 self
.report_video_info_webpage_download(video_id
)
364 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
365 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
366 % (video_id
, el_type
))
367 request
= compat_urllib_request
.Request(video_info_url
)
369 video_info_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
370 video_info_webpage
= video_info_webpage_bytes
.decode('utf-8', 'ignore')
371 video_info
= compat_parse_qs(video_info_webpage
)
372 if 'token' in video_info
:
374 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
375 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % compat_str(err
))
377 if 'token' not in video_info
:
378 if 'reason' in video_info
:
379 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0])
381 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
384 # Check for "rental" videos
385 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
386 self
._downloader
.trouble(u
'ERROR: "rental" videos not supported')
389 # Start extracting information
390 self
.report_information_extraction(video_id
)
393 if 'author' not in video_info
:
394 self
._downloader
.trouble(u
'ERROR: unable to extract uploader name')
396 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
399 video_uploader_id
= None
400 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage
)
402 video_uploader_id
= mobj
.group(1)
404 self
._downloader
.trouble(u
'WARNING: unable to extract uploader nickname')
407 if 'title' not in video_info
:
408 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
410 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
413 if 'thumbnail_url' not in video_info
:
414 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
416 else: # don't panic if we can't find it
417 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
421 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
423 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
424 format_expressions
= ['%d %B %Y', '%B %d %Y', '%b %d %Y']
425 for expression
in format_expressions
:
427 upload_date
= datetime
.datetime
.strptime(upload_date
, expression
).strftime('%Y%m%d')
432 video_description
= get_element_by_id("eow-description", video_webpage
)
433 if video_description
:
434 video_description
= clean_html(video_description
)
436 video_description
= ''
439 video_subtitles
= None
440 if self
._downloader
.params
.get('writesubtitles', False):
441 (srt_error
, video_subtitles
) = self
._extract
_subtitles
(video_id
)
443 self
._downloader
.trouble(srt_error
)
445 if 'length_seconds' not in video_info
:
446 self
._downloader
.trouble(u
'WARNING: unable to extract video duration')
449 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
452 video_token
= compat_urllib_parse
.unquote_plus(video_info
['token'][0])
454 # Decide which formats to download
455 req_format
= self
._downloader
.params
.get('format', None)
457 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
458 self
.report_rtmp_download()
459 video_url_list
= [(None, video_info
['conn'][0])]
460 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
461 url_data_strs
= video_info
['url_encoded_fmt_stream_map'][0].split(',')
462 url_data
= [compat_parse_qs(uds
) for uds
in url_data_strs
]
463 url_data
= [ud
for ud
in url_data
if 'itag' in ud
and 'url' in ud
]
464 url_map
= dict((ud
['itag'][0], ud
['url'][0] + '&signature=' + ud
['sig'][0]) for ud
in url_data
)
466 format_limit
= self
._downloader
.params
.get('format_limit', None)
467 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
468 if format_limit
is not None and format_limit
in available_formats
:
469 format_list
= available_formats
[available_formats
.index(format_limit
):]
471 format_list
= available_formats
472 existing_formats
= [x
for x
in format_list
if x
in url_map
]
473 if len(existing_formats
) == 0:
474 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
476 if self
._downloader
.params
.get('listformats', None):
477 self
._print
_formats
(existing_formats
)
479 if req_format
is None or req_format
== 'best':
480 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
481 elif req_format
== 'worst':
482 video_url_list
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality
483 elif req_format
in ('-1', 'all'):
484 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
486 # Specific formats. We pick the first in a slash-delimeted sequence.
487 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
488 req_formats
= req_format
.split('/')
489 video_url_list
= None
490 for rf
in req_formats
:
492 video_url_list
= [(rf
, url_map
[rf
])]
494 if video_url_list
is None:
495 self
._downloader
.trouble(u
'ERROR: requested format not available')
498 self
._downloader
.trouble(u
'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
502 for format_param
, video_real_url
in video_url_list
:
504 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
506 video_format
= '{0} - {1}'.format(format_param
if format_param
else video_extension
,
507 self
._video
_dimensions
.get(format_param
, '???'))
511 'url': video_real_url
,
512 'uploader': video_uploader
,
513 'uploader_id': video_uploader_id
,
514 'upload_date': upload_date
,
515 'title': video_title
,
516 'ext': video_extension
,
517 'format': video_format
,
518 'thumbnail': video_thumbnail
,
519 'description': video_description
,
520 'player_url': player_url
,
521 'subtitles': video_subtitles
,
522 'duration': video_duration
527 class MetacafeIE(InfoExtractor
):
528 """Information Extractor for metacafe.com."""
# Matches metacafe watch URLs; group(1) is the video id, group(2) the display slug.
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Family-filter disclaimer page fetched during initialization.
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
# Endpoint receiving the age-confirmation form POST.
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = u'metacafe'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[metacafe] Confirming age')
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
554 def _real_initialize(self
):
555 # Retrieve disclaimer
556 request
= compat_urllib_request
.Request(self
._DISCLAIMER
)
558 self
.report_disclaimer()
559 disclaimer
= compat_urllib_request
.urlopen(request
).read()
560 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
561 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % compat_str(err
))
567 'submit': "Continue - I'm over 18",
569 request
= compat_urllib_request
.Request(self
._FILTER
_POST
, compat_urllib_parse
.urlencode(disclaimer_form
))
571 self
.report_age_confirmation()
572 disclaimer
= compat_urllib_request
.urlopen(request
).read()
573 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
574 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % compat_str(err
))
577 def _real_extract(self
, url
):
578 # Extract id and simplified title from URL
579 mobj
= re
.match(self
._VALID
_URL
, url
)
581 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
584 video_id
= mobj
.group(1)
586 # Check if video comes from YouTube
587 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
588 if mobj2
is not None:
589 self
._downloader
.download(['http://www.youtube.com/watch?v=%s' % mobj2
.group(1)])
592 # Retrieve video webpage to extract further information
593 request
= compat_urllib_request
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
595 self
.report_download_webpage(video_id
)
596 webpage
= compat_urllib_request
.urlopen(request
).read()
597 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
598 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % compat_str(err
))
601 # Extract URL, uploader and title from webpage
602 self
.report_extraction(video_id
)
603 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
605 mediaURL
= compat_urllib_parse
.unquote(mobj
.group(1))
606 video_extension
= mediaURL
[-3:]
608 # Extract gdaKey if available
609 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
613 gdaKey
= mobj
.group(1)
614 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
616 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
618 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
620 vardict
= compat_parse_qs(mobj
.group(1))
621 if 'mediaData' not in vardict
:
622 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
624 mobj
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0])
626 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
628 mediaURL
= mobj
.group(1).replace('\\/', '/')
629 video_extension
= mediaURL
[-3:]
630 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2))
632 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
634 self
._downloader
.trouble(u
'ERROR: unable to extract title')
636 video_title
= mobj
.group(1).decode('utf-8')
638 mobj
= re
.search(r
'submitter=(.*?);', webpage
)
640 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
642 video_uploader
= mobj
.group(1)
645 'id': video_id
.decode('utf-8'),
646 'url': video_url
.decode('utf-8'),
647 'uploader': video_uploader
.decode('utf-8'),
649 'title': video_title
,
650 'ext': video_extension
.decode('utf-8'),
654 class DailymotionIE(InfoExtractor
):
655 """Information Extractor for Dailymotion"""
# Matches dailymotion video URLs on any TLD; group(1) holds the raw video id.
_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
671 def _real_extract(self
, url
):
672 # Extract id and simplified title from URL
673 mobj
= re
.match(self
._VALID
_URL
, url
)
675 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
678 video_id
= mobj
.group(1).split('_')[0].split('?')[0]
680 video_extension
= 'mp4'
682 # Retrieve video webpage to extract further information
683 request
= compat_urllib_request
.Request(url
)
684 request
.add_header('Cookie', 'family_filter=off')
686 self
.report_download_webpage(video_id
)
687 webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
688 webpage
= webpage_bytes
.decode('utf-8')
689 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
690 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % compat_str(err
))
693 # Extract URL, uploader and title from webpage
694 self
.report_extraction(video_id
)
695 mobj
= re
.search(r
'\s*var flashvars = (.*)', webpage
)
697 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
699 flashvars
= compat_urllib_parse
.unquote(mobj
.group(1))
701 for key
in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
704 self
._downloader
.to_screen(u
'[dailymotion] Using %s' % key
)
707 self
._downloader
.trouble(u
'ERROR: unable to extract video URL')
710 mobj
= re
.search(r
'"' + max_quality
+ r
'":"(.+?)"', flashvars
)
712 self
._downloader
.trouble(u
'ERROR: unable to extract video URL')
715 video_url
= compat_urllib_parse
.unquote(mobj
.group(1)).replace('\\/', '/')
717 # TODO: support choosing qualities
719 mobj
= re
.search(r
'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage
)
721 self
._downloader
.trouble(u
'ERROR: unable to extract title')
723 video_title
= unescapeHTML(mobj
.group('title'))
725 video_uploader
= None
726 mobj
= re
.search(r
'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage
)
728 # lookin for official user
729 mobj_official
= re
.search(r
'<span rel="author"[^>]+?>([^<]+?)</span>', webpage
)
730 if mobj_official
is None:
731 self
._downloader
.trouble(u
'WARNING: unable to extract uploader nickname')
733 video_uploader
= mobj_official
.group(1)
735 video_uploader
= mobj
.group(1)
737 video_upload_date
= None
738 mobj
= re
.search(r
'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage
)
740 video_upload_date
= mobj
.group(3) + mobj
.group(2) + mobj
.group(1)
745 'uploader': video_uploader
,
746 'upload_date': video_upload_date
,
747 'title': video_title
,
748 'ext': video_extension
,
752 class PhotobucketIE(InfoExtractor
):
753 """Information extractor for photobucket.com."""
# Matches photobucket URLs whose "current" query parameter points at an .flv file.
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
769 def _real_extract(self
, url
):
770 # Extract id from URL
771 mobj
= re
.match(self
._VALID
_URL
, url
)
773 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
776 video_id
= mobj
.group(1)
778 video_extension
= 'flv'
780 # Retrieve video webpage to extract further information
781 request
= compat_urllib_request
.Request(url
)
783 self
.report_download_webpage(video_id
)
784 webpage
= compat_urllib_request
.urlopen(request
).read()
785 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
786 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
789 # Extract URL, uploader, and title from webpage
790 self
.report_extraction(video_id
)
791 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
793 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
795 mediaURL
= compat_urllib_parse
.unquote(mobj
.group(1))
799 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
801 self
._downloader
.trouble(u
'ERROR: unable to extract title')
803 video_title
= mobj
.group(1).decode('utf-8')
805 video_uploader
= mobj
.group(2).decode('utf-8')
808 'id': video_id
.decode('utf-8'),
809 'url': video_url
.decode('utf-8'),
810 'uploader': video_uploader
,
812 'title': video_title
,
813 'ext': video_extension
.decode('utf-8'),
817 class YahooIE(InfoExtractor
):
818 """Information extractor for video.yahoo.com."""
# _VALID_URL matches all Yahoo! Video URLs
# _VPAGE_URL matches only the extractable '/watch/' URLs
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
838 def _real_extract(self
, url
, new_video
=True):
839 # Extract ID from URL
840 mobj
= re
.match(self
._VALID
_URL
, url
)
842 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
845 video_id
= mobj
.group(2)
846 video_extension
= 'flv'
848 # Rewrite valid but non-extractable URLs as
849 # extractable English language /watch/ URLs
850 if re
.match(self
._VPAGE
_URL
, url
) is None:
851 request
= compat_urllib_request
.Request(url
)
853 webpage
= compat_urllib_request
.urlopen(request
).read()
854 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
855 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
858 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
860 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
862 yahoo_id
= mobj
.group(1)
864 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
866 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
868 yahoo_vid
= mobj
.group(1)
870 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
871 return self
._real
_extract
(url
, new_video
=False)
873 # Retrieve video webpage to extract further information
874 request
= compat_urllib_request
.Request(url
)
876 self
.report_download_webpage(video_id
)
877 webpage
= compat_urllib_request
.urlopen(request
).read()
878 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
879 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
882 # Extract uploader and title from webpage
883 self
.report_extraction(video_id
)
884 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
886 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
888 video_title
= mobj
.group(1).decode('utf-8')
890 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
892 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
894 video_uploader
= mobj
.group(1).decode('utf-8')
896 # Extract video thumbnail
897 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
899 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
901 video_thumbnail
= mobj
.group(1).decode('utf-8')
903 # Extract video description
904 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
906 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
908 video_description
= mobj
.group(1).decode('utf-8')
909 if not video_description
:
910 video_description
= 'No description available.'
912 # Extract video height and width
913 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
915 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
917 yv_video_height
= mobj
.group(1)
919 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
921 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
923 yv_video_width
= mobj
.group(1)
925 # Retrieve video playlist to extract media URL
926 # I'm not completely sure what all these options are, but we
927 # seem to need most of them, otherwise the server sends a 401.
928 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
929 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
930 request
= compat_urllib_request
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
931 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
932 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
934 self
.report_download_webpage(video_id
)
935 webpage
= compat_urllib_request
.urlopen(request
).read()
936 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
937 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
940 # Extract media URL from playlist XML
941 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
943 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
945 video_url
= compat_urllib_parse
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
946 video_url
= unescapeHTML(video_url
)
949 'id': video_id
.decode('utf-8'),
951 'uploader': video_uploader
,
953 'title': video_title
,
954 'ext': video_extension
.decode('utf-8'),
955 'thumbnail': video_thumbnail
.decode('utf-8'),
956 'description': video_description
,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Pulls the page's embedded player config JSON and picks the best
    available codec/quality combination.
    """

    # _VALID_URL matches Vimeo URLs; group(1) is the numeric video id
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first hit in quality order; bail out if nothing matched
        video_quality = video_codec = video_extension = None
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        if video_codec is None:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live streams (URLs matching _LIVE_URL) and the
    "+7" catch-up streams, walking a chain of intermediate pages
    with grep_webpage().
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw page body, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, match regex against it and collect the groups named
        in matchTuples ([(group_index, key, error_message), ...]) into a
        dict. Reports trouble and returns None when a group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp URL of a live stream page."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # NOTE(review): the middle regex alternation below was reconstructed
        # from a damaged listing — confirm the rtmp group against a live page.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a "+7" catch-up stream into an info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First follows URL-shortener style redirects (restarting the
    extraction chain on the final URL), then falls back to scraping
    the page for a direct video file URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip entity headers that no longer apply to the new request
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build a bare opener with just the handlers we need
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Queries the GData API page by page (50 results per page) and hands
    the resulting watch URLs to the downloader.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports how many items exist in total; never ask
            # for more pages than that.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Scrapes the HTML search results page by page until either n ids
    were collected or no "next page" link is present.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Same pagination pattern as the other search IEs, but dedupes ids
    with an explicit already_seen set.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Collects all video ids of a playlist (or artist/course page) and
    queues each watch URL, honoring the downloader's playliststart /
    playlistend parameters.
    """

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group(3) holds the video id
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the channel's paginated video list and queues every watch URL.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed (_GDATA_PAGE_SIZE per query)
    and queues every collected watch URL, honoring playliststart /
    playlistend.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the profile page, then pages
    through the mobile episode-list endpoint (12 results per page).
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # blip.tv's Ajax endpoint returns at most this many episodes per page
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1909 class DepositFilesIE(InfoExtractor
):
1910 """Information extractor for depositfiles.com"""
1912 _VALID_URL
= r
'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1914 def report_download_webpage(self
, file_id
):
1915 """Report webpage download."""
1916 self
._downloader
.to_screen(u
'[DepositFiles] %s: Downloading webpage' % file_id
)
1918 def report_extraction(self
, file_id
):
1919 """Report information extraction."""
1920 self
._downloader
.to_screen(u
'[DepositFiles] %s: Extracting information' % file_id
)
1922 def _real_extract(self
, url
):
1923 file_id
= url
.split('/')[-1]
1924 # Rebuild url in english locale
1925 url
= 'http://depositfiles.com/en/files/' + file_id
1927 # Retrieve file webpage with 'Free download' button pressed
1928 free_download_indication
= { 'gateway_result' : '1' }
1929 request
= compat_urllib_request
.Request(url
, compat_urllib_parse
.urlencode(free_download_indication
))
1931 self
.report_download_webpage(file_id
)
1932 webpage
= compat_urllib_request
.urlopen(request
).read()
1933 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1934 self
._downloader
.trouble(u
'ERROR: Unable to retrieve file webpage: %s' % compat_str(err
))
1937 # Search for the real file URL
1938 mobj
= re
.search(r
'<form action="(http://fileshare.+?)"', webpage
)
1939 if (mobj
is None) or (mobj
.group(1) is None):
1940 # Try to figure out reason of the error.
1941 mobj
= re
.search(r
'<strong>(Attention.*?)</strong>', webpage
, re
.DOTALL
)
1942 if (mobj
is not None) and (mobj
.group(1) is not None):
1943 restriction_message
= re
.sub('\s+', ' ', mobj
.group(1)).strip()
1944 self
._downloader
.trouble(u
'ERROR: %s' % restriction_message
)
1946 self
._downloader
.trouble(u
'ERROR: unable to extract download URL from: %s' % url
)
1949 file_url
= mobj
.group(1)
1950 file_extension
= os
.path
.splitext(file_url
)[1][1:]
1952 # Search for file title
1953 mobj
= re
.search(r
'<b title="(.*?)">', webpage
)
1955 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1957 file_title
= mobj
.group(1).decode('utf-8')
1960 'id': file_id
.decode('utf-8'),
1961 'url': file_url
.decode('utf-8'),
1963 'upload_date': None,
1964 'title': file_title
,
1965 'ext': file_extension
.decode('utf-8'),
1969 class FacebookIE(InfoExtractor
):
1970 """Information Extractor for Facebook"""
1973 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1974 _LOGIN_URL
= 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1975 _NETRC_MACHINE
= 'facebook'
1976 _available_formats
= ['video', 'highqual', 'lowqual']
1977 _video_extensions
= {
1982 IE_NAME
= u
'facebook'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in *downloader*."""
    super(FacebookIE, self).__init__(downloader)
1987 def _reporter(self
, message
):
1988 """Add header and report message."""
1989 self
._downloader
.to_screen(u
'[facebook] %s' % message
)
1991 def report_login(self
):
1992 """Report attempt to log in."""
1993 self
._reporter
(u
'Logging in')
1995 def report_video_webpage_download(self
, video_id
):
1996 """Report attempt to download video webpage."""
1997 self
._reporter
(u
'%s: Downloading video webpage' % video_id
)
1999 def report_information_extraction(self
, video_id
):
2000 """Report attempt to extract video information."""
2001 self
._reporter
(u
'%s: Extracting video information' % video_id
)
2003 def _parse_page(self
, video_webpage
):
2004 """Extract video information from page"""
2006 data
= {'title': r
'\("video_title", "(.*?)"\)',
2007 'description': r
'<div class="datawrap">(.*?)</div>',
2008 'owner': r
'\("video_owner_name", "(.*?)"\)',
2009 'thumbnail': r
'\("thumb_url", "(?P<THUMB>.*?)"\)',
2012 for piece
in data
.keys():
2013 mobj
= re
.search(data
[piece
], video_webpage
)
2014 if mobj
is not None:
2015 video_info
[piece
] = compat_urllib_parse
.unquote_plus(mobj
.group(1).decode("unicode_escape"))
2019 for fmt
in self
._available
_formats
:
2020 mobj
= re
.search(r
'\("%s_src\", "(.+?)"\)' % fmt
, video_webpage
)
2021 if mobj
is not None:
2022 # URL is in a Javascript segment inside an escaped Unicode format within
2023 # the generally utf-8 page
2024 video_urls
[fmt
] = compat_urllib_parse
.unquote_plus(mobj
.group(1).decode("unicode_escape"))
2025 video_info
['video_urls'] = video_urls
2029 def _real_initialize(self
):
2030 if self
._downloader
is None:
2035 downloader_params
= self
._downloader
.params
2037 # Attempt to use provided username and password or .netrc data
2038 if downloader_params
.get('username', None) is not None:
2039 useremail
= downloader_params
['username']
2040 password
= downloader_params
['password']
2041 elif downloader_params
.get('usenetrc', False):
2043 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
2044 if info
is not None:
2048 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
2049 except (IOError, netrc
.NetrcParseError
) as err
:
2050 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % compat_str(err
))
2053 if useremail
is None:
2062 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, compat_urllib_parse
.urlencode(login_form
))
2065 login_results
= compat_urllib_request
.urlopen(request
).read()
2066 if re
.search(r
'<form(.*)name="login"(.*)</form>', login_results
) is not None:
2067 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2069 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2070 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % compat_str(err
))
2073 def _real_extract(self
, url
):
2074 mobj
= re
.match(self
._VALID
_URL
, url
)
2076 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2078 video_id
= mobj
.group('ID')
2081 self
.report_video_webpage_download(video_id
)
2082 request
= compat_urllib_request
.Request('https://www.facebook.com/video/video.php?v=%s' % video_id
)
2084 page
= compat_urllib_request
.urlopen(request
)
2085 video_webpage
= page
.read()
2086 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2087 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % compat_str(err
))
2090 # Start extracting information
2091 self
.report_information_extraction(video_id
)
2093 # Extract information
2094 video_info
= self
._parse
_page
(video_webpage
)
2097 if 'owner' not in video_info
:
2098 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
2100 video_uploader
= video_info
['owner']
2103 if 'title' not in video_info
:
2104 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
2106 video_title
= video_info
['title']
2107 video_title
= video_title
.decode('utf-8')
2110 if 'thumbnail' not in video_info
:
2111 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
2112 video_thumbnail
= ''
2114 video_thumbnail
= video_info
['thumbnail']
2118 if 'upload_date' in video_info
:
2119 upload_time
= video_info
['upload_date']
2120 timetuple
= email
.utils
.parsedate_tz(upload_time
)
2121 if timetuple
is not None:
2123 upload_date
= time
.strftime('%Y%m%d', timetuple
[0:9])
2128 video_description
= video_info
.get('description', 'No description available.')
2130 url_map
= video_info
['video_urls']
2132 # Decide which formats to download
2133 req_format
= self
._downloader
.params
.get('format', None)
2134 format_limit
= self
._downloader
.params
.get('format_limit', None)
2136 if format_limit
is not None and format_limit
in self
._available
_formats
:
2137 format_list
= self
._available
_formats
[self
._available
_formats
.index(format_limit
):]
2139 format_list
= self
._available
_formats
2140 existing_formats
= [x
for x
in format_list
if x
in url_map
]
2141 if len(existing_formats
) == 0:
2142 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
2144 if req_format
is None:
2145 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
2146 elif req_format
== 'worst':
2147 video_url_list
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality
2148 elif req_format
== '-1':
2149 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
2152 if req_format
not in url_map
:
2153 self
._downloader
.trouble(u
'ERROR: requested format not available')
2155 video_url_list
= [(req_format
, url_map
[req_format
])] # Specific format
2158 for format_param
, video_real_url
in video_url_list
:
2160 video_extension
= self
._video
_extensions
.get(format_param
, 'mp4')
2163 'id': video_id
.decode('utf-8'),
2164 'url': video_real_url
.decode('utf-8'),
2165 'uploader': video_uploader
.decode('utf-8'),
2166 'upload_date': upload_date
,
2167 'title': video_title
,
2168 'ext': video_extension
.decode('utf-8'),
2169 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
2170 'thumbnail': video_thumbnail
.decode('utf-8'),
2171 'description': video_description
.decode('utf-8'),
2175 class BlipTVIE(InfoExtractor
):
2176 """Information extractor for blip.tv"""
2178 _VALID_URL
= r
'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2179 _URL_EXT
= r
'^.*\.([a-z0-9]+)$'
2180 IE_NAME
= u
'blip.tv'
2182 def report_extraction(self
, file_id
):
2183 """Report information extraction."""
2184 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, file_id
))
2186 def report_direct_download(self
, title
):
2187 """Report information extraction."""
2188 self
._downloader
.to_screen(u
'[%s] %s: Direct download detected' % (self
.IE_NAME
, title
))
2190 def _real_extract(self
, url
):
2191 mobj
= re
.match(self
._VALID
_URL
, url
)
2193 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2200 json_url
= url
+ cchar
+ 'skin=json&version=2&no_wrap=1'
2201 request
= compat_urllib_request
.Request(json_url
)
2202 self
.report_extraction(mobj
.group(1))
2205 urlh
= compat_urllib_request
.urlopen(request
)
2206 if urlh
.headers
.get('Content-Type', '').startswith('video/'): # Direct download
2207 basename
= url
.split('/')[-1]
2208 title
,ext
= os
.path
.splitext(basename
)
2209 title
= title
.decode('UTF-8')
2210 ext
= ext
.replace('.', '')
2211 self
.report_direct_download(title
)
2216 'upload_date': None,
2221 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2222 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % compat_str(err
))
2224 if info
is None: # Regular URL
2226 json_code_bytes
= urlh
.read()
2227 json_code
= json_code_bytes
.decode('utf-8')
2228 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2229 self
._downloader
.trouble(u
'ERROR: unable to read video info webpage: %s' % compat_str(err
))
2233 json_data
= json
.loads(json_code
)
2234 if 'Post' in json_data
:
2235 data
= json_data
['Post']
2239 upload_date
= datetime
.datetime
.strptime(data
['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2240 video_url
= data
['media']['url']
2241 umobj
= re
.match(self
._URL
_EXT
, video_url
)
2243 raise ValueError('Can not determine filename extension')
2244 ext
= umobj
.group(1)
2247 'id': data
['item_id'],
2249 'uploader': data
['display_name'],
2250 'upload_date': upload_date
,
2251 'title': data
['title'],
2253 'format': data
['media']['mimeType'],
2254 'thumbnail': data
['thumbnailUrl'],
2255 'description': data
['description'],
2256 'player_url': data
['embedUrl']
2258 except (ValueError,KeyError) as err
:
2259 self
._downloader
.trouble(u
'ERROR: unable to parse video information: %s' % repr(err
))
2262 std_headers
['User-Agent'] = 'iTunes/10.6.1'
2266 class MyVideoIE(InfoExtractor
):
2267 """Information Extractor for myvideo.de."""
2269 _VALID_URL
= r
'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2270 IE_NAME
= u
'myvideo'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in *downloader*."""
    super(MyVideoIE, self).__init__(downloader)
2275 def report_download_webpage(self
, video_id
):
2276 """Report webpage download."""
2277 self
._downloader
.to_screen(u
'[myvideo] %s: Downloading webpage' % video_id
)
2279 def report_extraction(self
, video_id
):
2280 """Report information extraction."""
2281 self
._downloader
.to_screen(u
'[myvideo] %s: Extracting information' % video_id
)
2283 def _real_extract(self
,url
):
2284 mobj
= re
.match(self
._VALID
_URL
, url
)
2286 self
._download
.trouble(u
'ERROR: invalid URL: %s' % url
)
2289 video_id
= mobj
.group(1)
2292 request
= compat_urllib_request
.Request('http://www.myvideo.de/watch/%s' % video_id
)
2294 self
.report_download_webpage(video_id
)
2295 webpage
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
2296 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2297 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
2300 self
.report_extraction(video_id
)
2301 mobj
= re
.search(r
'<link rel=\'image_src
\' href
=\'(http
://is[0-9].myvideo\
.de
/de
/movie
[0-9]+/[a
-f0
-9]+)/thumbs
/[^
.]+\
.jpg
\' />',
2304 self._downloader.trouble(u'ERROR
: unable to extract media URL
')
2306 video_url = mobj.group(1) + ('/%s.flv
' % video_id)
2308 mobj = re.search('<title
>([^
<]+)</title
>', webpage)
2310 self._downloader.trouble(u'ERROR
: unable to extract title
')
2313 video_title = mobj.group(1)
2319 'upload_date
': None,
2320 'title
': video_title,
2324 class ComedyCentralIE(InfoExtractor):
2325 """Information extractor for The Daily Show and Colbert Report """
2327 # urls can be abbreviations like :thedailyshow or :colbert
2328 # urls for episodes like:
2329 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2330 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2331 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2332 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2333 |(https?://)?(www\.)?
2334 (?P<showname>thedailyshow|colbertnation)\.com/
2335 (full-episodes/(?P<episode>.*)|
2337 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2338 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2340 IE_NAME = u'comedycentral
'
2342 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2344 _video_extensions = {
2352 _video_dimensions = {
def suitable(self, url):
    """Return True when *url* matches this extractor's verbose URL pattern."""
    match = re.match(self._VALID_URL, url, re.VERBOSE)
    return match is not None
2365 def report_extraction(self, episode_id):
2366 self._downloader.to_screen(u'[comedycentral
] %s: Extracting information
' % episode_id)
2368 def report_config_download(self, episode_id):
2369 self._downloader.to_screen(u'[comedycentral
] %s: Downloading configuration
' % episode_id)
2371 def report_index_download(self, episode_id):
2372 self._downloader.to_screen(u'[comedycentral
] %s: Downloading show index
' % episode_id)
2374 def report_player_url(self, episode_id):
2375 self._downloader.to_screen(u'[comedycentral
] %s: Determining player URL
' % episode_id)
2378 def _print_formats(self, formats):
2379 print('Available formats
:')
2381 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4
'), self._video_dimensions.get(x, '???
')))
2384 def _real_extract(self, url):
2385 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2387 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2390 if mobj.group('shortname
'):
2391 if mobj.group('shortname
') in ('tds
', 'thedailyshow
'):
2392 url = u'http
://www
.thedailyshow
.com
/full
-episodes
/'
2394 url = u'http
://www
.colbertnation
.com
/full
-episodes
/'
2395 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2396 assert mobj is not None
2398 if mobj.group('clip
'):
2399 if mobj.group('showname
') == 'thedailyshow
':
2400 epTitle = mobj.group('tdstitle
')
2402 epTitle = mobj.group('cntitle
')
2405 dlNewest = not mobj.group('episode
')
2407 epTitle = mobj.group('showname
')
2409 epTitle = mobj.group('episode
')
2411 req = compat_urllib_request.Request(url)
2412 self.report_extraction(epTitle)
2414 htmlHandle = compat_urllib_request.urlopen(req)
2415 html = htmlHandle.read()
2416 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2417 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % compat_str(err))
2420 url = htmlHandle.geturl()
2421 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2423 self._downloader.trouble(u'ERROR
: Invalid redirected URL
: ' + url)
2425 if mobj.group('episode
') == '':
2426 self._downloader.trouble(u'ERROR
: Redirected URL
is still
not specific
: ' + url)
2428 epTitle = mobj.group('episode
')
2430 mMovieParams = re.findall('(?
:<param name
="movie" value
="|var url = ")(http
://media
.mtvnservices
.com
/([^
"]*(?:episode|video).*?:.*?))"', html)
2432 if len(mMovieParams) == 0:
2433 # The Colbert Report embeds the information in a without
2434 # a URL prefix; so extract the alternate reference
2435 # and then add the URL prefix manually.
2437 altMovieParams = re.findall('data
-mgid
="([^"]*(?
:episode|video
).*?
:.*?
)"', html)
2438 if len(altMovieParams) == 0:
2439 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2442 mMovieParams = [("http
://media
.mtvnservices
.com
/" + altMovieParams[0], altMovieParams[0])]
2444 playerUrl_raw = mMovieParams[0][0]
2445 self.report_player_url(epTitle)
2447 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2448 playerUrl = urlHandle.geturl()
2449 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2450 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2453 uri = mMovieParams[0][1]
2454 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2455 self.report_index_download(epTitle)
2457 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2458 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2459 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2464 idoc = xml.etree.ElementTree.fromstring(indexXml)
2465 itemEls = idoc.findall('.//item')
2466 for itemEl in itemEls:
2467 mediaId = itemEl.findall('./guid')[0].text
2468 shortMediaId = mediaId.split(':')[-1]
2469 showId = mediaId.split(':')[-2].replace('.com', '')
2470 officialTitle = itemEl.findall('./title')[0].text
2471 officialDate = itemEl.findall('./pubDate')[0].text
2473 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2474 compat_urllib_parse.urlencode({'uri': mediaId}))
2475 configReq = compat_urllib_request.Request(configUrl)
2476 self.report_config_download(epTitle)
2478 configXml = compat_urllib_request.urlopen(configReq).read()
2479 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2480 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2483 cdoc = xml.etree.ElementTree.fromstring(configXml)
2485 for rendition in cdoc.findall('.//rendition'):
2486 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2490 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2493 if self._downloader.params.get('listformats', None):
2494 self._print_formats([i[0] for i in turls])
2497 # For now, just pick the highest bitrate
2498 format,video_url = turls[-1]
2500 # Get the format arg from the arg stream
2501 req_format = self._downloader.params.get('format', None)
2503 # Select format if we can find one
2506 format, video_url = f, v
2509 # Patch to download from alternative CDN, which does not
2510 # break on current RTMPDump builds
2511 broken_cdn = "rtmpe
://viacomccstrmfs
.fplive
.net
/viacomccstrm
/gsp
.comedystor
/"
2512 better_cdn = "rtmpe
://cp10740
.edgefcs
.net
/ondemand
/mtvnorigin
/gsp
.comedystor
/"
2514 if video_url.startswith(broken_cdn):
2515 video_url = video_url.replace(broken_cdn, better_cdn)
2517 effTitle = showId + u'-' + epTitle
2522 'upload_date': officialDate,
2527 'description': officialTitle,
2528 'player_url': None #playerUrl
2531 results.append(info)
2536 class EscapistIE(InfoExtractor):
2537 """Information extractor for The Escapist """
2539 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2540 IE_NAME = u'escapist'
2542 def report_extraction(self, showName):
2543 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2545 def report_config_download(self, showName):
2546 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2548 def _real_extract(self, url):
2549 mobj = re.match(self._VALID_URL, url)
2551 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2553 showName = mobj.group('showname')
2554 videoId = mobj.group('episode')
2556 self.report_extraction(showName)
2558 webPage = compat_urllib_request.urlopen(url)
2559 webPageBytes = webPage.read()
2560 m = re.match(r'text/html; charset="?
([^
"]+)"?
', webPage.headers['Content
-Type
'])
2561 webPage = webPageBytes.decode(m.group(1) if m else 'utf
-8')
2562 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2563 self._downloader.trouble(u'ERROR
: unable to download webpage
: ' + compat_str(err))
2566 descMatch = re.search('<meta name
="description" content
="([^"]*)"', webPage)
2567 description = unescapeHTML(descMatch.group(1))
2568 imgMatch = re.search('<meta property="og
:image
" content="([^
"]*)"', webPage)
2569 imgUrl = unescapeHTML(imgMatch.group(1))
2570 playerUrlMatch = re.search('<meta
property="og:video" content
="([^"]*)"', webPage)
2571 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2572 configUrlMatch = re.search('config=(.*)$', playerUrl)
2573 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2575 self.report_config_download(showName)
2577 configJSON = compat_urllib_request.urlopen(configUrl)
2578 m = re.match(r'text/html; charset="?
([^
"]+)"?
', configJSON.headers['Content
-Type
'])
2579 configJSON = configJSON.read().decode(m.group(1) if m else 'utf
-8')
2580 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2581 self._downloader.trouble(u'ERROR
: unable to download configuration
: ' + compat_str(err))
2584 # Technically, it's JavaScript
, not JSON
2585 configJSON
= configJSON
.replace("'", '"')
2588 config
= json
.loads(configJSON
)
2589 except (ValueError,) as err
:
2590 self
._downloader
.trouble(u
'ERROR: Invalid JSON in configuration file: ' + compat_str(err
))
2593 playlist
= config
['playlist']
2594 videoUrl
= playlist
[1]['url']
2599 'uploader': showName
,
2600 'upload_date': None,
2603 'thumbnail': imgUrl
,
2604 'description': description
,
2605 'player_url': playerUrl
,
2611 class CollegeHumorIE(InfoExtractor
):
2612 """Information extractor for collegehumor.com"""
2615 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2616 IE_NAME
= u
'collegehumor'
2618 def report_manifest(self
, video_id
):
2619 """Report information extraction."""
2620 self
._downloader
.to_screen(u
'[%s] %s: Downloading XML manifest' % (self
.IE_NAME
, video_id
))
2622 def report_extraction(self
, video_id
):
2623 """Report information extraction."""
2624 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2626 def _real_extract(self
, url
):
2627 mobj
= re
.match(self
._VALID
_URL
, url
)
2629 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2631 video_id
= mobj
.group('videoid')
2636 'upload_date': None,
2639 self
.report_extraction(video_id
)
2640 xmlUrl
= 'http://www.collegehumor.com/moogaloop/video/' + video_id
2642 metaXml
= compat_urllib_request
.urlopen(xmlUrl
).read()
2643 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2644 self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % compat_str(err
))
2647 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
2649 videoNode
= mdoc
.findall('./video')[0]
2650 info
['description'] = videoNode
.findall('./description')[0].text
2651 info
['title'] = videoNode
.findall('./caption')[0].text
2652 info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
2653 manifest_url
= videoNode
.findall('./file')[0].text
2655 self
._downloader
.trouble(u
'\nERROR: Invalid metadata XML file')
2658 manifest_url
+= '?hdcore=2.10.3'
2659 self
.report_manifest(video_id
)
2661 manifestXml
= compat_urllib_request
.urlopen(manifest_url
).read()
2662 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2663 self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % compat_str(err
))
2666 adoc
= xml
.etree
.ElementTree
.fromstring(manifestXml
)
2668 media_node
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2669 node_id
= media_node
.attrib
['url']
2670 video_id
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2671 except IndexError as err
:
2672 self
._downloader
.trouble(u
'\nERROR: Invalid manifest file')
2675 url_pr
= compat_urllib_parse_urlparse(manifest_url
)
2676 url
= url_pr
.scheme
+ '://' + url_pr
.netloc
+ '/z' + video_id
[:-2] + '/' + node_id
+ 'Seg1-Frag1'
2683 class XVideosIE(InfoExtractor
):
2684 """Information extractor for xvideos.com"""
2686 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2687 IE_NAME
= u
'xvideos'
2689 def report_webpage(self
, video_id
):
2690 """Report information extraction."""
2691 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
2693 def report_extraction(self
, video_id
):
2694 """Report information extraction."""
2695 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2697 def _real_extract(self
, url
):
2698 mobj
= re
.match(self
._VALID
_URL
, url
)
2700 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2702 video_id
= mobj
.group(1)
2704 self
.report_webpage(video_id
)
2706 request
= compat_urllib_request
.Request(r
'http://www.xvideos.com/video' + video_id
)
2708 webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
2709 webpage
= webpage_bytes
.decode('utf-8', 'replace')
2710 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2711 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % compat_str(err
))
2714 self
.report_extraction(video_id
)
2718 mobj
= re
.search(r
'flv_url=(.+?)&', webpage
)
2720 self
._downloader
.trouble(u
'ERROR: unable to extract video url')
2722 video_url
= compat_urllib_parse
.unquote(mobj
.group(1))
2726 mobj
= re
.search(r
'<title>(.*?)\s+-\s+XVID', webpage
)
2728 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
2730 video_title
= mobj
.group(1)
2733 # Extract video thumbnail
2734 mobj
= re
.search(r
'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage
)
2736 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
2738 video_thumbnail
= mobj
.group(0)
2744 'upload_date': None,
2745 'title': video_title
,
2747 'thumbnail': video_thumbnail
,
2748 'description': None,
2754 class SoundcloudIE(InfoExtractor
):
2755 """Information extractor for soundcloud.com
2756 To access the media, the uid of the song and a stream token
2757 must be extracted from the page source and the script must make
2758 a request to media.soundcloud.com/crossdomain.xml. Then
2759 the media can be grabbed by requesting from an url composed
2760 of the stream token and uid
2763 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2764 IE_NAME
= u
'soundcloud'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in *downloader*."""
    super(SoundcloudIE, self).__init__(downloader)
2769 def report_resolve(self
, video_id
):
2770 """Report information extraction."""
2771 self
._downloader
.to_screen(u
'[%s] %s: Resolving id' % (self
.IE_NAME
, video_id
))
2773 def report_extraction(self
, video_id
):
2774 """Report information extraction."""
2775 self
._downloader
.to_screen(u
'[%s] %s: Retrieving stream' % (self
.IE_NAME
, video_id
))
2777 def _real_extract(self
, url
):
2778 mobj
= re
.match(self
._VALID
_URL
, url
)
2780 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2783 # extract uploader (which is in the url)
2784 uploader
= mobj
.group(1)
2785 # extract simple title (uploader + slug of song title)
2786 slug_title
= mobj
.group(2)
2787 simple_title
= uploader
+ u
'-' + slug_title
2789 self
.report_resolve('%s/%s' % (uploader
, slug_title
))
2791 url
= 'http://soundcloud.com/%s/%s' % (uploader
, slug_title
)
2792 resolv_url
= 'http://api.soundcloud.com/resolve.json?url=' + url
+ '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2793 request
= compat_urllib_request
.Request(resolv_url
)
2795 info_json_bytes
= compat_urllib_request
.urlopen(request
).read()
2796 info_json
= info_json_bytes
.decode('utf-8')
2797 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2798 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % compat_str(err
))
2801 info
= json
.loads(info_json
)
2802 video_id
= info
['id']
2803 self
.report_extraction('%s/%s' % (uploader
, slug_title
))
2805 streams_url
= 'https://api.sndcdn.com/i1/tracks/' + str(video_id
) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2806 request
= compat_urllib_request
.Request(streams_url
)
2808 stream_json_bytes
= compat_urllib_request
.urlopen(request
).read()
2809 stream_json
= stream_json_bytes
.decode('utf-8')
2810 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
2811 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % compat_str(err
))
2814 streams
= json
.loads(stream_json
)
2815 mediaURL
= streams
['http_mp3_128_url']
2820 'uploader': info
['user']['username'],
2821 'upload_date': info
['created_at'],
2822 'title': info
['title'],
2824 'description': info
['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com presentation pages (RTMPE streams)."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred
    # from the sibling extractors' naming convention — confirm against history.
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE video URL, title and description from an InfoQ page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded path in jsclassref.
        # NOTE(review): str.decode('base64') is Python 2 only; needs base64.b64decode on Python 3.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description (optional — fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive id and extension from the last path component of the stream URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a {bitrate: [urls]} mapping or a plain url
        list (no bitrate info); the TypeError branch handles the latter.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the url; any network failure means "try the next one".
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Pretty-print the available format/bitrate combinations to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader (which is in the url) & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one yields a live url.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course playlist, or site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the video page links and recurse through self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect all course pages and recurse into each of them.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred.
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 downloads)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred.
    IE_NAME = u'Youku'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-unique session id: millisecond timestamp + two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character-shuffle table from the server-provided seed (LCG-based)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # float() keeps seed / 65536 below from truncating to 0 under Python 2
        # integer division.
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id via the seeded mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): branch bodies below were lost in the damaged source
            # and reconstructed from the surrounding logic — confirm.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred.
    IE_NAME = u'xnxx'
    # Patterns applied to the downloaded page below.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry date found."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader found."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title found."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return
        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred.
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived directly from the page path, not the page body.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First capture group of rexp in the page, HTML-unescaped, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # FIX: key was misspelled 'uploader_date' — the documented field
            # consumed downstream (see class contract) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 alone => whole channel (paged archive); group 2 => single clip.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred.
    IE_NAME = u'FunnyOrDie'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
        # NOTE(review): there is deliberately no `return` after trouble() above
        # in the original — execution falls through and raises AttributeError
        # on m.group() when the match failed; trouble() normally aborts first.
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com video statuses."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred.
    IE_NAME = u'TweetReel'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # NOTE(review): as in the original, the trouble() calls below are NOT
        # followed by `return`; execution continues and would raise on m.group()
        # when a match failed (trouble() normally aborts first).
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game/video pages."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    # NOTE(review): IE_NAME line was lost in the damaged source; value inferred.
    IE_NAME = u'Steam'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base implementation because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_download_video_page(self, game_id):
        """Report video page download."""
        self._downloader.to_screen(u'[%s] %s: Downloading video page' % (self.IE_NAME, game_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        try:
            self.report_download_video_page(gameID)
            urlh = compat_urllib_request.urlopen(videourl)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
        titles = list(re.finditer(namesRE, webpage))

        # NOTE(review): the result-assembly loop below was partially lost in the
        # damaged source and reconstructed; titles are matched to movies by
        # position — confirm against project history.
        videos = []
        for i, vid in enumerate(mweb):
            video_id = vid.group('videoID')
            title = titles[i].group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': title,
            }
            videos.append(info)
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos."""

    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The flv is served from a fixed CDN path derived from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'unable to download webpage: %s' % compat_str(err))
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
3826 def gen_extractors():
3827 """ Return a list of an instance of every supported extractor.
3828 The order does matter; the first extractor matched is the one handling the URL.
3831 YoutubePlaylistIE(),
3855 StanfordOpenClassroomIE(),