X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/60179645808cbc3cff3ba062312bfa360de48965..3446dfb7cb84025f67250be069e44020d3606b84:/youtube_dl/InfoExtractors.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 092bfef22..971248022 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -106,19 +106,24 @@ def _real_extract(self, url): def IE_NAME(self): return type(self).__name__[:-2] - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the response handle """ if note is None: note = u'Downloading video webpage' self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note)) try: - urlh = compat_urllib_request.urlopen(url_or_request) - webpage_bytes = urlh.read() - return webpage_bytes.decode('utf-8', 'replace') + return compat_urllib_request.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the data of the page as a string """ + urlh = self._request_webpage(url_or_request, video_id, note, errnote) + webpage_bytes = urlh.read() + return webpage_bytes.decode('utf-8', 'replace') + class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" @@ -2204,6 +2209,7 @@ def _real_extract(self, url): cchar = '?' 
json_url = url + cchar + 'skin=json&version=2&no_wrap=1' request = compat_urllib_request.Request(json_url) + request.add_header('User-Agent', 'iTunes/10.6.1') self.report_extraction(mobj.group(1)) info = None try: @@ -2224,8 +2230,7 @@ def _real_extract(self, url): 'urlhandle': urlh } except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) - return + raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) if info is None: # Regular URL try: json_code_bytes = urlh.read() @@ -2258,13 +2263,13 @@ def _real_extract(self, url): 'format': data['media']['mimeType'], 'thumbnail': data['thumbnailUrl'], 'description': data['description'], - 'player_url': data['embedUrl'] + 'player_url': data['embedUrl'], + 'user_agent': 'iTunes/10.6.1', } except (ValueError,KeyError) as err: self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err)) return - std_headers['User-Agent'] = 'iTunes/10.6.1' return [info] @@ -3726,35 +3731,11 @@ def _real_extract(self, url): class YouPornIE(InfoExtractor): """Information extractor for youporn.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - # def report_id(self, video_id): - # """Report finding video ID""" - # self._downloader.to_screen(u'[youporn] Video ID: %s' % video_id) - - # def report_webpage(self, url): - # """Report downloading page""" - # self._downloader.to_screen(u'[youporn] Downloaded page: %s' % url) - - # def report_title(self, video_title): - # """Report dfinding title""" - # self._downloader.to_screen(u'[youporn] Title: %s' % video_title) - - # def report_uploader(self, uploader): - # """Report dfinding title""" - # self._downloader.to_screen(u'[youporn] Uploader: %s' % uploader) - - # def 
report_upload_date(self, video_date): - # """Report finding date""" - # self._downloader.to_screen(u'[youporn] Date: %s' % video_date) - def _print_formats(self, formats): """Print all available formats""" - print('Available formats:') + print(u'Available formats:') print(u'ext\t\tformat') print(u'---------------------------------') for format in formats: @@ -3773,53 +3754,46 @@ def _real_extract(self, url): return video_id = mobj.group('videoid') - #self.report_id(video_id) - webpage = self._download_webpage(url, video_id) - #self.report_webpage(url) + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) # Get the video title - VIDEO_TITLE_RE = r'videoTitleArea">(?P<title>.*)</h1>' - result = re.search(VIDEO_TITLE_RE, webpage) + result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video title') - return + raise ExtractorError(u'ERROR: unable to extract video title') video_title = result.group('title').strip() - #self.report_title(video_title) # Get the video date - VIDEO_DATE_RE = r'Date:</b>(?P<date>.*)</li>' - result = re.search(VIDEO_DATE_RE, webpage) + result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video date') - return - upload_date = result.group('date').strip() - #self.report_upload_date(upload_date) + self._downloader.to_stderr(u'WARNING: unable to extract video date') + upload_date = None + else: + upload_date = result.group('date').strip() # Get the video uploader - VIDEO_UPLOADER_RE = r'Submitted:</b>(?P<uploader>.*)</li>' - result = re.search(VIDEO_UPLOADER_RE, webpage) + result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract uploader') - return - video_uploader = result.group('uploader').strip() - video_uploader = 
clean_html( video_uploader ) - #self.report_uploader(video_uploader) + self._downloader.to_stderr(u'ERROR: unable to extract uploader') + video_uploader = None + else: + video_uploader = result.group('uploader').strip() + video_uploader = clean_html( video_uploader ) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' result = re.search(DOWNLOAD_LIST_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract download list') - return + raise ExtractorError(u'Unable to extract download list') download_list_html = result.group('download_list').strip() # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' links = re.findall(LINK_RE, download_list_html) if(len(links) == 0): - self._downloader.trouble(u'ERROR: no known formats available for video') - return + raise ExtractorError(u'ERROR: no known formats available for video') self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) @@ -3857,10 +3831,8 @@ def _real_extract(self, url): return req_format = self._downloader.params.get('format', None) - #format_limit = self._downloader.params.get('format_limit', None) self._downloader.to_screen(u'[youporn] Format: %s' % req_format) - if req_format is None or req_format == 'best': return [formats[0]] elif req_format == 'worst': @@ -3878,28 +3850,8 @@ def _real_extract(self, url): class PornotubeIE(InfoExtractor): """Information extractor for pornotube.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' - # def __init__(self, downloader=None): - # InfoExtractor.__init__(self, downloader) - - # def report_extract_entry(self, url): - # """Report downloading extry""" - # self._downloader.to_screen(u'[pornotube] Downloading entry: %s' % url.decode('utf-8')) - - # def report_date(self, upload_date): - # """Report finding uploaded date""" - # 
self._downloader.to_screen(u'[pornotube] Entry date: %s' % upload_date) - - # def report_webpage(self, url): - # """Report downloading page""" - # self._downloader.to_screen(u'[pornotube] Downloaded page: %s' % url) - - # def report_title(self, video_title): - # """Report downloading extry""" - # self._downloader.to_screen(u'[pornotube] Title: %s' % video_title.decode('utf-8')) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -3908,11 +3860,9 @@ def _real_extract(self, url): video_id = mobj.group('videoid') video_title = mobj.group('title') - #self.report_title(video_title); # Get webpage content webpage = self._download_webpage(url, video_id) - #self.report_webpage(url) # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' @@ -3921,7 +3871,6 @@ def _real_extract(self, url): self._downloader.trouble(u'ERROR: unable to extract video url') return video_url = compat_urllib_parse.unquote(result.group('url')) - #self.report_extract_entry(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' @@ -3930,7 +3879,6 @@ def _real_extract(self, url): self._downloader.trouble(u'ERROR: unable to extract video title') return upload_date = result.group('date') - #self.report_date(upload_date); info = {'id': video_id, 'url': video_url, @@ -3938,10 +3886,7 @@ def _real_extract(self, url): 'upload_date': upload_date, 'title': video_title, 'ext': 'flv', - 'format': 'flv', - 'thumbnail': None, - 'description': None, - 'player_url': None} + 'format': 'flv'} return [info] @@ -3949,28 +3894,8 @@ def _real_extract(self, url): class YouJizzIE(InfoExtractor): """Information extractor for youjizz.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - # def report_extract_entry(self, url): - # """Report downloading extry""" - # 
self._downloader.to_screen(u'[youjizz] Downloading entry: %s' % url.decode('utf-8')) - - # def report_webpage(self, url): - # """Report downloading page""" - # self._downloader.to_screen(u'[youjizz] Downloaded page: %s' % url) - - # def report_title(self, video_title): - # """Report downloading extry""" - # self._downloader.to_screen(u'[youjizz] Title: %s' % video_title.decode('utf-8')) - - # def report_embed_page(self, embed_page): - # """Report downloading extry""" - # self._downloader.to_screen(u'[youjizz] Embed Page: %s' % embed_page.decode('utf-8')) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -3981,48 +3906,34 @@ def _real_extract(self, url): # Get webpage content webpage = self._download_webpage(url, video_id) - #self.report_webpage(url) # Get the video title - VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>' - result = re.search(VIDEO_TITLE_RE, webpage) + result = re.search(r'<title>(?P<title>.*)</title>', webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video title') - return + raise ExtractorError(u'ERROR: unable to extract video title') video_title = result.group('title').strip() - #self.report_title(video_title) # Get the embed page - EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)' - result = re.search(EMBED_PAGE_RE, webpage) + result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract embed page') - return + raise ExtractorError(u'ERROR: unable to extract embed page') embed_page_url = result.group(0).strip() video_id = result.group('videoid') - #self.report_embed_page(embed_page_url) webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);' - result = re.search(SOURCE_RE, webpage) + result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage) if result is None: 
- self._downloader.trouble(u'ERROR: unable to extract video url') - return + raise ExtractorError(u'ERROR: unable to extract video url') video_url = result.group('source') - #self.report_extract_entry(video_url) info = {'id': video_id, 'url': video_url, - 'uploader': None, - 'upload_date': None, 'title': video_title, 'ext': 'flv', 'format': 'flv', - 'thumbnail': None, - 'description': None, 'player_url': embed_page_url} return [info]