get_element_by_id,
HEADRequest,
int_or_none,
+ join_nonempty,
KNOWN_EXTENSIONS,
merge_dicts,
mimetype2ext,
'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
'uploader': 'yorkmba99@hotmail.com',
'timestamp': 1387699629,
- 'upload_date': "20131222",
+ 'upload_date': '20131222',
},
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
# Archive.org metadata API doesn't clearly demarcate playlist entries
# or subtitle tracks, so we get them from the embeddable player.
- embed_page = self._download_webpage(
- 'https://archive.org/embed/' + identifier, identifier)
+ embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
playlist = self._playlist_data(embed_page)
entries = {}
'thumbnails': [],
'artist': p.get('artist'),
'track': p.get('title'),
- 'subtitles': {}}
+ 'subtitles': {},
+ }
for track in p.get('tracks', []):
if track['kind'] != 'subtitles':
continue
-
- entries[p['orig']][track['label']] = {
- 'url': 'https://archive.org/' + track['file'].lstrip('/')}
+ entries[p['orig']]['subtitles'][track['label']] = [{
+ 'url': 'https://archive.org/' + track['file'].lstrip('/'),
+ }]
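+ # yt-dlp expects subtitles as {label: [{'url': ...}, ...]}, stored under
+ # the 'subtitles' key initialised above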
- metadata = self._download_json(
- 'http://archive.org/metadata/' + identifier, identifier)
+ metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
m = metadata['metadata']
identifier = m['identifier']
'license': m.get('licenseurl'),
'release_date': unified_strdate(m.get('date')),
'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
- 'webpage_url': 'https://archive.org/details/' + identifier,
+ 'webpage_url': f'https://archive.org/details/{identifier}',
'location': m.get('venue'),
'release_year': int_or_none(m.get('year'))}
'discnumber': int_or_none(f.get('disc')),
'release_year': int_or_none(f.get('year'))})
entry = entries[f['name']]
- elif f.get('original') in entries:
+ elif traverse_obj(f, 'original', expected_type=str) in entries:
entry = entries[f['original']]
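+ # some items seem to carry a non-str 'original' (e.g. a list), which the
+ # expected_type=str guard above skips instead of raising on the `in` check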
else:
continue
'filesize': int_or_none(f.get('size')),
'protocol': 'https'})
- # Sort available formats by filesize
for entry in entries.values():
- entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1)))
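+ # _sort_formats applies yt-dlp's standard format ordering
+ # rather than a bare filesize sort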
+ self._sort_formats(entry['formats'])
if len(entries) == 1:
# If there's only one item, use it as the main info dict
- only_video = entries[list(entries.keys())[0]]
+ only_video = next(iter(entries.values()))
if entry_id:
info = merge_dicts(only_video, info)
else:
class YoutubeWebArchiveIE(InfoExtractor):
IE_NAME = 'web.archive:youtube'
- IE_DESC = 'web.archive.org saved youtube videos'
- _VALID_URL = r"""(?x)^
- (?:https?://)?web\.archive\.org/
- (?:web/)?
- (?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
-
- (?:https?(?::|%3[Aa])//)?
- (?:
- (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
- |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
- )
- (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
- """
+ IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
+ _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
+ (?:https?://)?web\.archive\.org/
+ (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
+ (?:https?(?::|%3[Aa])//)?(?:
+ (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
+ |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
+ )
+ )(?P<id>[0-9A-Za-z_-]{11})
+ (?(prefix)
+ (?::(?P<date2>[0-9]{14}))?$|
+ (?:%26|[#&]|$)
+ )'''
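+ # the (?(prefix)...) conditional allows an optional ':<14-digit date>' suffix
+ # only for 'ytarchive:' IDs; plain URLs keep the %26/#/& terminators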
_TESTS = [
{
}, {
'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
'only_matching': True
- }
+ }, {
+ 'url': 'ytarchive:BaW_jenozKc:20050214000000',
+ 'only_matching': True
+ }, {
+ 'url': 'ytarchive:BaW_jenozKc',
+ 'only_matching': True
+ },
]
_YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
_YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
_OLDEST_CAPTURE_DATE = 20050214000000
_NEWEST_CAPTURE_DATE = 20500101000000
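+ # fallback bounds: 20050214 is YouTube's founding date,
+ # 20500101 is an arbitrary far-future cap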
- def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'):
+ def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False):
# CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
query = {
'url': url,
'collapse': collapse or [],
**(query or {})
}
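+ # list values ('collapse' etc.) are encoded as repeated query parameters,
+ # e.g. &collapse=timestamp:6&collapse=digest (see the CDX docs above)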
- res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query)
+ res = self._download_json(
+ 'https://web.archive.org/cdx/search/cdx', item_id,
+ note or 'Downloading CDX API JSON', query=query, fatal=fatal)
if isinstance(res, list) and len(res) >= 2:
# format response to make it easier to use
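+ # e.g. [['timestamp', 'original'], ['20201011000000', 'http://...']]
+ # becomes [{'timestamp': '20201011000000', 'original': 'http://...'}]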
return list(dict(zip(res[0], v)) for v in res[1:])
regex), webpage, name, default='{}'), video_id, fatal=False)
def _extract_webpage_title(self, webpage):
- page_title = self._html_search_regex(
- r'<title>([^<]*)</title>', webpage, 'title', default='')
+ page_title = self._html_extract_title(webpage, default='')
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
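+ # e.g. both 'YouTube - my video' and 'my video - YouTube' yield 'my video'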
return self._html_search_regex(
r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
page_title, 'title', default='')
def _extract_metadata(self, video_id, webpage):
-
search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
player_response = self._extract_yt_initial_variable(
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
# Prefer the new polymer UI captures as we support extracting more metadata from them
# WBM captures all seem to have switched to this layout by ~July 2020
- modern_captures = list(filter(lambda x: x >= 20200701000000, all_captures))
+ modern_captures = [x for x in all_captures if x >= 20200701000000]
if modern_captures:
capture_dates.append(modern_captures[0])
capture_dates.append(url_date)
# Fallbacks if any of the above fail
capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
- return orderedSet(capture_dates)
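+ # filter(None, ...) drops unset dates (e.g. when the URL had none), so the
+ # capture loop in _real_extract no longer needs its own None check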
+ return orderedSet(filter(None, capture_dates))
def _real_extract(self, url):
-
- url_date, video_id = self._match_valid_url(url).groups()
+ video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
+ url_date = url_date or url_date_2
urlh = None
try:
raise
capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
- self.write_debug('Captures to try: ' + ', '.join(str(i) for i in capture_dates if i is not None))
+ self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
info = {'id': video_id}
for capture in capture_dates:
- if not capture:
- continue
webpage = self._download_webpage(
(self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
info['thumbnails'] = self._extract_thumbnails(video_id)
if urlh:
- url = compat_urllib_parse_unquote(urlh.url)
+ url = compat_urllib_parse_unquote(urlh.geturl())
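+ # geturl() is the final URL after any Wayback redirects to the archived media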
video_file_url_qs = parse_qs(url)
# Attempt to recover any ext & format info from playback url & response headers
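+ # Wayback preserves origin headers with an 'x-archive-orig-' prefix,
+ # e.g. x-archive-orig-content-length is the original Content-Length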
format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}