import string
import struct
import traceback
-import xml.etree.ElementTree
import zlib
from .common import InfoExtractor, SearchInfoExtractor
clean_html,
get_cachedir,
get_element_by_id,
+ get_element_by_attribute,
ExtractorError,
unescapeHTML,
unified_strdate,
'asrs': 1,
})
list_url = caption_url + '&' + list_params
- list_page = self._download_webpage(list_url, video_id)
- caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
+ caption_list = self._download_xml(list_url, video_id)
original_lang_node = caption_list.find('track')
if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
self._downloader.report_warning(u'Video doesn\'t have automatic captions')
)"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
_MORE_PAGES_INDICATOR = r'data-link-type="next"'
- _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&'
+ _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist'
@classmethod
def _real_initialize(self):
self._login()
+ def _ids_to_results(self, ids):  # Map each video id to self.url_result(id, 'Youtube', video_id=id), keeping the order of ids
+ return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+
+ def _extract_mix(self, playlist_id):  # Extract a Youtube mix; returns self.playlist_result(entries, playlist_id, title)
+ # The mixes are generated from a single video;
+ # the id of the playlist is just 'RD' + video_id, so playlist_id[2:] is that video's id
+ url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
+ webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
+ title_span = (get_element_by_attribute('class', 'title long-title', webpage) or  # try the 'title long-title' element first
+ get_element_by_attribute('class', 'title ', webpage))  # otherwise fall back to class 'title ' (note the trailing space)
+ title = clean_html(title_span)
+ video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)  # captures the 11-char id of watch links that follow a data-index attribute and carry list=<playlist_id>
+ ids = orderedSet(re.findall(video_re, webpage))  # presumably drops repeated ids while keeping first-seen order -- confirm against orderedSet
+ url_results = self._ids_to_results(ids)
+
+ return self.playlist_result(url_results, playlist_id, title)
+
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
else:
self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ if len(playlist_id) == 13: # 'RD' + 11 characters for the video id
+ # Mixes require a custom extraction process
+ return self._extract_mix(playlist_id)
+
# Extract the video ids from the playlist pages
ids = []
for page_num in itertools.count(1):
url = self._TEMPLATE_URL % (playlist_id, page_num)
page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
- # The ids are duplicated
- new_ids = orderedSet(re.findall(self._VIDEO_RE, page))
+ matches = re.finditer(self._VIDEO_RE, page)
+ # We remove the duplicates and the link with index 0
+ # (it's not the first video of the playlist)
+ new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
ids.extend(new_ids)
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
playlist_title = self._og_search_title(page)
- url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
+ url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title)
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- _PAGING_STEP = 30
# use action_load_personal_feed instead of action_load_system_feed
_PERSONAL_FEED = False
def _real_extract(self, url):
feed_entries = []
- # The step argument is available only in 2.7 or higher
- for i in itertools.count(0):
- paging = i*self._PAGING_STEP
+ paging = 0
+ for i in itertools.count(1):
info = self._download_webpage(self._FEED_TEMPLATE % paging,
u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i)
for video_id in ids)
if info['paging'] is None:
break
+ paging = info['paging']
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
_VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
_FEED_NAME = 'watch_later'
_PLAYLIST_TITLE = u'Youtube Watch Later'
- _PAGING_STEP = 100
_PERSONAL_FEED = True
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
_PERSONAL_FEED = True
_PLAYLIST_TITLE = u'Youtube Watch History'
- def _real_extract(self, url):
- webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History')
- data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging')
- # The step is actually a ridiculously big number (like 1374343569725646)
- self._PAGING_STEP = int(data_paging)
- return super(YoutubeHistoryIE, self)._real_extract(url)
-
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = u'youtube:favorites'
IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'