def IE_NAME(self):
return type(self).__name__[:-2]
- def _download_webpage(self, url, video_id, note=None, errnote=None):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ """ Returns the response handle """
if note is None:
note = u'Downloading video webpage'
self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
try:
- urlh = compat_urllib_request.urlopen(url)
- webpage_bytes = urlh.read()
- return webpage_bytes.decode('utf-8', 'replace')
+ return compat_urllib_request.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
+ raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ """ Returns the data of the page as a string """
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+ webpage_bytes = urlh.read()
+ return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
# uploader_id
video_uploader_id = None
- mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
+ mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
if mobj is not None:
video_uploader_id = mobj.group(1)
else:
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url)
request.add_header('Cookie', 'family_filter=off')
- try:
- self.report_download_webpage(video_id)
- webpage_bytes = compat_urllib_request.urlopen(request).read()
- webpage = webpage_bytes.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(request, video_id)
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
cchar = '?'
json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
request = compat_urllib_request.Request(json_url)
+ request.add_header('User-Agent', 'iTunes/10.6.1')
self.report_extraction(mobj.group(1))
info = None
try:
'urlhandle': urlh
}
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
- return
+ raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
if info is None: # Regular URL
try:
json_code_bytes = urlh.read()
'format': data['media']['mimeType'],
'thumbnail': data['thumbnailUrl'],
'description': data['description'],
- 'player_url': data['embedUrl']
+ 'player_url': data['embedUrl'],
+ 'user_agent': 'iTunes/10.6.1',
}
except (ValueError,KeyError) as err:
self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
return
- std_headers['User-Agent'] = 'iTunes/10.6.1'
return [info]
(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
$"""
- IE_NAME = u'comedycentral'
_available_formats = ['3500', '2200', '1700', '1200', '750', '400']
def report_extraction(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
- def report_config_download(self, episode_id):
- self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
+ def report_config_download(self, episode_id, media_id):
+ self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
def report_index_download(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
- def report_player_url(self, episode_id):
- self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
-
-
def _print_formats(self, formats):
print('Available formats:')
for x in formats:
try:
htmlHandle = compat_urllib_request.urlopen(req)
html = htmlHandle.read()
+ webpage = html.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
return
return
epTitle = mobj.group('episode')
- mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
+ mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
if len(mMovieParams) == 0:
# The Colbert Report embeds the information in a without
# a URL prefix; so extract the alternate reference
# and then add the URL prefix manually.
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
+ altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
if len(altMovieParams) == 0:
self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
return
else:
mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
- playerUrl_raw = mMovieParams[0][0]
- self.report_player_url(epTitle)
- try:
- urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
- playerUrl = urlHandle.geturl()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
- return
-
uri = mMovieParams[0][1]
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
self.report_index_download(epTitle)
idoc = xml.etree.ElementTree.fromstring(indexXml)
itemEls = idoc.findall('.//item')
- for itemEl in itemEls:
+ for partNum,itemEl in enumerate(itemEls):
mediaId = itemEl.findall('./guid')[0].text
shortMediaId = mediaId.split(':')[-1]
showId = mediaId.split(':')[-2].replace('.com', '')
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
compat_urllib_parse.urlencode({'uri': mediaId}))
configReq = compat_urllib_request.Request(configUrl)
- self.report_config_download(epTitle)
+ self.report_config_download(epTitle, shortMediaId)
try:
configXml = compat_urllib_request.urlopen(configReq).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
return
# For now, just pick the highest bitrate
- format,video_url = turls[-1]
+ format,rtmp_video_url = turls[-1]
# Get the format arg from the arg stream
req_format = self._downloader.params.get('format', None)
# Select format if we can find one
for f,v in turls:
if f == req_format:
- format, video_url = f, v
+ format, rtmp_video_url = f, v
break
- # Patch to download from alternative CDN, which does not
- # break on current RTMPDump builds
- broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
- better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
+ m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
+ if not m:
+ raise ExtractorError(u'Cannot transform RTMP url')
+ base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+ video_url = base + m.group('finalid')
- if video_url.startswith(broken_cdn):
- video_url = video_url.replace(broken_cdn, better_cdn)
-
- effTitle = showId + u'-' + epTitle
+ effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
info = {
'id': shortMediaId,
'url': video_url,
'format': format,
'thumbnail': None,
'description': officialTitle,
- 'player_url': None #playerUrl
}
-
results.append(info)
return results
return [info]
-
class CollegeHumorIE(InfoExtractor):
"""Information extractor for collegehumor.com"""
class XNXXIE(InfoExtractor):
"""Information extractor for xnxx.com"""
- _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
+ _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
IE_NAME = u'xnxx'
VIDEO_URL_RE = r'flv_url=(.*?)&'
VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
return
response = json.loads(webpage)
+ if type(response) != list:
+ error_text = response.get('error', 'unknown error')
+ self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
+ return
info = []
for clip in response:
video_url = clip['video_file_url']
if video_url:
video_extension = os.path.splitext(video_url)[1][1:]
- video_date = re.sub('-', '', clip['created_on'][:10])
+ video_date = re.sub('-', '', clip['start_time'][:10])
+ video_uploader_id = clip.get('user_id', clip.get('channel_id'))
info.append({
'id': clip['id'],
'url': video_url,
'title': clip['title'],
- 'uploader': clip.get('user_id', clip.get('channel_id')),
+ 'uploader': clip.get('channel_name', video_uploader_id),
+ 'uploader_id': video_uploader_id,
'upload_date': video_date,
'ext': video_extension,
})
paged = True
api += '/channel/archives/%s.json'
else:
- api += '/clip/show/%s.json'
+ api += '/broadcast/by_archive/%s.json'
api = api % (video_id,)
self.report_extraction(video_id)
videourl = 'http://store.steampowered.com/video/%s/' % gameID
webpage = self._download_webpage(videourl, gameID)
mweb = re.finditer(urlRE, webpage)
- namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
- titles = list(re.finditer(namesRE, webpage))
+ namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
+ titles = re.finditer(namesRE, webpage)
videos = []
for vid,vtitle in zip(mweb,titles):
video_id = vid.group('videoID')
'id':video_id,
'url':video_url,
'ext': 'flv',
- 'title': title
+ 'title': unescapeHTML(title)
}
videos.append(info)
return videos
-
+
class UstreamIE(InfoExtractor):
- _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
+ _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
IE_NAME = u'ustream'
-
+
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('videoID')
return [info]
+
+class YouPornIE(InfoExtractor):
+ """Information extractor for youporn.com."""
+ _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
+
+ def _print_formats(self, formats):
+ """Print all available formats"""
+ print(u'Available formats:')
+ print(u'ext\t\tformat')
+ print(u'---------------------------------')
+ for format in formats:
+ print(u'%s\t\t%s' % (format['ext'], format['format']))
+
+ def _specific(self, req_format, formats):
+ for x in formats:
+ if(x["format"]==req_format):
+ return x
+ return None
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+
+ video_id = mobj.group('videoid')
+
+ req = compat_urllib_request.Request(url)
+ req.add_header('Cookie', 'age_verified=1')
+ webpage = self._download_webpage(req, video_id)
+
+ # Get the video title
+ result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
+ if result is None:
+ raise ExtractorError(u'ERROR: unable to extract video title')
+ video_title = result.group('title').strip()
+
+ # Get the video date
+ result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
+ if result is None:
+ self._downloader.to_stderr(u'WARNING: unable to extract video date')
+ upload_date = None
+ else:
+ upload_date = result.group('date').strip()
+
+ # Get the video uploader
+ result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
+ if result is None:
+ self._downloader.to_stderr(u'ERROR: unable to extract uploader')
+ video_uploader = None
+ else:
+ video_uploader = result.group('uploader').strip()
+ video_uploader = clean_html( video_uploader )
+
+ # Get all of the formats available
+ DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
+ result = re.search(DOWNLOAD_LIST_RE, webpage)
+ if result is None:
+ raise ExtractorError(u'Unable to extract download list')
+ download_list_html = result.group('download_list').strip()
+
+ # Get all of the links from the page
+ LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
+ links = re.findall(LINK_RE, download_list_html)
+ if(len(links) == 0):
+ raise ExtractorError(u'ERROR: no known formats available for video')
+
+ self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
+
+ formats = []
+ for link in links:
+
+ # A link looks like this:
+ # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
+ # A path looks like this:
+ # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
+ video_url = unescapeHTML( link )
+ path = compat_urllib_parse_urlparse( video_url ).path
+ extension = os.path.splitext( path )[1][1:]
+ format = path.split('/')[4].split('_')[:2]
+ size = format[0]
+ bitrate = format[1]
+ format = "-".join( format )
+ title = u'%s-%s-%s' % (video_title, size, bitrate)
+
+ formats.append({
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': video_uploader,
+ 'upload_date': upload_date,
+ 'title': title,
+ 'ext': extension,
+ 'format': format,
+ 'thumbnail': None,
+ 'description': None,
+ 'player_url': None
+ })
+
+ if self._downloader.params.get('listformats', None):
+ self._print_formats(formats)
+ return
+
+ req_format = self._downloader.params.get('format', None)
+ self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
+
+ if req_format is None or req_format == 'best':
+ return [formats[0]]
+ elif req_format == 'worst':
+ return [formats[-1]]
+ elif req_format in ('-1', 'all'):
+ return formats
+ else:
+ format = self._specific( req_format, formats )
+ if result is None:
+ self._downloader.trouble(u'ERROR: requested format not available')
+ return
+ return [format]
+
+
+
+class PornotubeIE(InfoExtractor):
+ """Information extractor for pornotube.com."""
+ _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+
+ video_id = mobj.group('videoid')
+ video_title = mobj.group('title')
+
+ # Get webpage content
+ webpage = self._download_webpage(url, video_id)
+
+ # Get the video URL
+ VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
+ result = re.search(VIDEO_URL_RE, webpage)
+ if result is None:
+ self._downloader.trouble(u'ERROR: unable to extract video url')
+ return
+ video_url = compat_urllib_parse.unquote(result.group('url'))
+
+ #Get the uploaded date
+ VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
+ result = re.search(VIDEO_UPLOADED_RE, webpage)
+ if result is None:
+ self._downloader.trouble(u'ERROR: unable to extract video title')
+ return
+ upload_date = result.group('date')
+
+ info = {'id': video_id,
+ 'url': video_url,
+ 'uploader': None,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'ext': 'flv',
+ 'format': 'flv'}
+
+ return [info]
+
+
+
+class YouJizzIE(InfoExtractor):
+ """Information extractor for youjizz.com."""
+ _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+
+ video_id = mobj.group('videoid')
+
+ # Get webpage content
+ webpage = self._download_webpage(url, video_id)
+
+ # Get the video title
+ result = re.search(r'<title>(?P<title>.*)</title>', webpage)
+ if result is None:
+ raise ExtractorError(u'ERROR: unable to extract video title')
+ video_title = result.group('title').strip()
+
+ # Get the embed page
+ result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
+ if result is None:
+ raise ExtractorError(u'ERROR: unable to extract embed page')
+
+ embed_page_url = result.group(0).strip()
+ video_id = result.group('videoid')
+
+ webpage = self._download_webpage(embed_page_url, video_id)
+
+ # Get the video URL
+ result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
+ if result is None:
+ raise ExtractorError(u'ERROR: unable to extract video url')
+ video_url = result.group('source')
+
+ info = {'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'ext': 'flv',
+ 'format': 'flv',
+ 'player_url': embed_page_url}
+
+ return [info]
+
+
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
MTVIE(),
YoukuIE(),
XNXXIE(),
+ YouJizzIE(),
+ PornotubeIE(),
+ YouPornIE(),
GooglePlusIE(),
ArteTvIE(),
NBAIE(),