X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/3014b0ae835e3f42b5f3628464ed7e4b2557ef6c..09728d5fbc93c769b3f8971c06e9ed0bfb168b37:/youtube_dl/extractor/youtube.py diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 44c1191bd..6c9f77d95 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -125,6 +125,12 @@ def _login(self): if login_results is False: return False + error_msg = self._html_search_regex( + r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', + login_results, 'error message', default=None) + if error_msg: + raise ExtractorError('Unable to login: %s' % error_msg, expected=True) + if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) @@ -338,6 +344,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, @@ -1320,9 +1328,9 @@ def add_dash_mpd(video_info): if video_description: video_description = re.sub(r'''(?x) ]*> [^<]+\.{3}\s* @@ -1818,20 +1826,32 @@ def _real_initialize(self): def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id - url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading Youtube mix') + ids = [] + last_id = playlist_id[-11:] + for n in itertools.count(1): + url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + webpage = self._download_webpage( + url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) + new_ids = orderedSet(re.findall( + r'''(?xs)data-video-username=".*?".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), + webpage)) + # Fetch new pages until all the videos are repeated, it seems that + # there are always 51 unique videos. + new_ids = [_id for _id in new_ids if _id not in ids] + if not new_ids: + break + ids.extend(new_ids) + last_id = ids[-1] + + url_results = self._ids_to_results(ids) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( search_title('playlist-title') or search_title('title long-title') or search_title('title')) title = clean_html(title_span) - ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -2121,10 +2141,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P[^&]+)(?:[&]|$)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P[^"]+))?' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -2139,32 +2160,8 @@ class YoutubeSearchURLIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - result_code = self._search_regex( - r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') - - part_codes = re.findall( - r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) - entries = [] - for part_code in part_codes: - part_title = self._html_search_regex( - [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) - part_url_snippet = self._html_search_regex( - r'(?s)href="([^"]+)"', part_code, 'item URL') - part_url = compat_urlparse.urljoin( - 'https://www.youtube.com/', part_url_snippet) - entries.append({ - '_type': 'url', - 'url': part_url, - 'title': part_title, - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': query, - } + return self.playlist_result(self._process_page(webpage), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):