jfr.im git - yt-dlp.git/blame_incremental - yt

Commit	Line	Data
	1	import json
	2	import re
	3	import urllib.parse
	4
	5	from .common import InfoExtractor
	6	from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
	7	from ..compat import compat_HTTPError, compat_urllib_parse_unquote
	8	from ..utils import (
	9	KNOWN_EXTENSIONS,
	10	ExtractorError,
	11	HEADRequest,
	12	bug_reports_message,
	13	clean_html,
	14	dict_get,
	15	extract_attributes,
	16	get_element_by_id,
	17	int_or_none,
	18	join_nonempty,
	19	js_to_json,
	20	merge_dicts,
	21	mimetype2ext,
	22	orderedSet,
	23	parse_duration,
	24	parse_qs,
	25	str_or_none,
	26	str_to_int,
	27	traverse_obj,
	28	try_get,
	29	unified_strdate,
	30	unified_timestamp,
	31	url_or_none,
	32	urlhandle_detect_ext,
	33	)
	34
	35
	36	class ArchiveOrgIE(InfoExtractor):
	37	IE_NAME = 'archive.org'
	38	IE_DESC = 'archive.org video and audio'
	39	_VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details\|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
	40	_TESTS = [{
	41	'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
	42	'md5': '8af1d4cf447933ed3c7f4871162602db',
	43	'info_dict': {
	44	'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
	45	'ext': 'ogv',
	46	'title': '1968 Demo - FJCC Conference Presentation Reel #1',
	47	'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
	48	'release_date': '19681210',
	49	'timestamp': 1268695290,
	50	'upload_date': '20100315',
	51	'creator': 'SRI International',
	52	'uploader': 'laura@archive.org',
	53	'thumbnail': r're:https://archive\.org/download/.*\.jpg',
	54	'release_year': 1968,
	55	'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr',
	56	'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect',
	57
	58	},
	59	}, {
	60	'url': 'https://archive.org/details/Cops1922',
	61	'md5': '0869000b4ce265e8ca62738b336b268a',
	62	'info_dict': {
	63	'id': 'Cops1922',
	64	'ext': 'mp4',
	65	'title': 'Buster Keaton\'s "Cops" (1922)',
	66	'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca',
	67	'uploader': 'yorkmba99@hotmail.com',
	68	'timestamp': 1387699629,
	69	'upload_date': '20131222',
	70	'display_id': 'Cops-v2.mp4',
	71	'thumbnail': r're:https://archive\.org/download/.*\.jpg',
	72	'duration': 1091.96,
	73	},
	74	}, {
	75	'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
	76	'only_matching': True,
	77	}, {
	78	'url': 'https://archive.org/details/Election_Ads',
	79	'md5': 'eec5cddebd4793c6a653b69c3b11f2e6',
	80	'info_dict': {
	81	'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
	82	'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
	83	'ext': 'mpg',
	84	'thumbnail': r're:https://archive\.org/download/.*\.jpg',
	85	'duration': 59.77,
	86	'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
	87	},
	88	}, {
	89	'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
	90	'md5': 'ea1eed8234e7d4165f38c8c769edef38',
	91	'info_dict': {
	92	'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
	93	'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
	94	'ext': 'mpg',
	95	'timestamp': 1205588045,
	96	'uploader': 'mikedavisstripmaster@yahoo.com',
	97	'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
	98	'upload_date': '20080315',
	99	'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
	100	'duration': 59.51,
	101	'license': 'http://creativecommons.org/licenses/publicdomain/',
	102	'thumbnail': r're:https://archive\.org/download/.*\.jpg',
	103	},
	104	}, {
	105	'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
	106	'md5': '7d07ffb42aba6537c28e053efa4b54c9',
	107	'info_dict': {
	108	'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
	109	'title': 'Turning',
	110	'ext': 'flac',
	111	'track': 'Turning',
	112	'creator': 'Grateful Dead',
	113	'display_id': 'gd1977-05-08d01t01.flac',
	114	'track_number': 1,
	115	'album': '1977-05-08 - Barton Hall - Cornell University',
	116	'duration': 39.8,
	117	},
	118	}, {
	119	'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
	120	'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
	121	'info_dict': {
	122	'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
	123	'title': 'Deal',
	124	'ext': 'flac',
	125	'timestamp': 1205895624,
	126	'uploader': 'mvernon54@yahoo.com',
	127	'description': 'md5:6c921464414814720c6593810a5c7e3d',
	128	'upload_date': '20080319',
	129	'location': 'Barton Hall - Cornell University',
	130	'duration': 438.68,
	131	'track': 'Deal',
	132	'creator': 'Grateful Dead',
	133	'album': '1977-05-08 - Barton Hall - Cornell University',
	134	'release_date': '19770508',
	135	'display_id': 'gd1977-05-08d01t07.flac',
	136	'release_year': 1977,
	137	'track_number': 7,
	138	},
	139	}, {
	140	# FIXME: give a better error message than just IndexError when all available formats are restricted
	141	'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
	142	'md5': '7cb019baa9b332e82ea7c10403acd180',
	143	'info_dict': {
	144	'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
	145	'title': 'Bells Of Rostov',
	146	'ext': 'mp3',
	147	},
	148	'skip': 'restricted'
	149	}, {
	150	'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
	151	'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
	152	'info_dict': {
	153	'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
	154	'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
	155	'ext': 'mp3',
	156	'timestamp': 1569662587,
	157	'uploader': 'associate-joygen-odiongan@archive.org',
	158	'description': 'md5:012b2d668ae753be36896f343d12a236',
	159	'upload_date': '20190928',
	160	},
	161	'skip': 'restricted'
	162	}, {
	163	# Original formats are private
	164	'url': 'https://archive.org/details/irelandthemakingofarepublic',
	165	'info_dict': {
	166	'id': 'irelandthemakingofarepublic',
	167	'title': 'Ireland: The Making of a Republic',
	168	'upload_date': '20160610',
	169	'description': 'md5:f70956a156645a658a0dc9513d9e78b7',
	170	'uploader': 'dimitrios@archive.org',
	171	'creator': ['British Broadcasting Corporation', 'Time-Life Films'],
	172	'timestamp': 1465594947,
	173	},
	174	'playlist': [
	175	{
	176	'md5': '0b211261b26590d49df968f71b90690d',
	177	'info_dict': {
	178	'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov',
	179	'ext': 'mp4',
	180	'title': 'irelandthemakingofarepublicreel1_01.mov',
	181	'duration': 130.46,
	182	'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg',
	183	'display_id': 'irelandthemakingofarepublicreel1_01.mov',
	184	},
	185	}, {
	186	'md5': '67335ee3b23a0da930841981c1e79b02',
	187	'info_dict': {
	188	'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov',
	189	'ext': 'mp4',
	190	'duration': 1395.13,
	191	'title': 'irelandthemakingofarepublicreel1_02.mov',
	192	'display_id': 'irelandthemakingofarepublicreel1_02.mov',
	193	'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg',
	194	},
	195	}, {
	196	'md5': 'e470e86787893603f4a341a16c281eb5',
	197	'info_dict': {
	198	'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov',
	199	'ext': 'mp4',
	200	'duration': 1602.67,
	201	'title': 'irelandthemakingofarepublicreel2.mov',
	202	'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg',
	203	'display_id': 'irelandthemakingofarepublicreel2.mov',
	204	},
	205	}
	206	]
	207	}]
	208
	209	@staticmethod
	210	def _playlist_data(webpage):
	211	element = re.findall(r'''(?xs)
	212	<input
	213	(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]\|="[^"]"\|='[^']'\|))?
	214	\s+class=['"]?js-play8-playlist['"]?
	215	(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]\|="[^"]"\|='[^']'\|))?
	216	\s*/>
	217	''', webpage)[0]
	218
	219	return json.loads(extract_attributes(element)['value'])
	220
	221	def _real_extract(self, url):
	222	video_id = urllib.parse.unquote_plus(self._match_id(url))
	223	identifier, entry_id = (video_id.split('/', 1) + [None])[:2]
	224
	225	# Archive.org metadata API doesn't clearly demarcate playlist entries
	226	# or subtitle tracks, so we get them from the embeddable player.
	227	embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
	228	playlist = self._playlist_data(embed_page)
	229
	230	entries = {}
	231	for p in playlist:
	232	# If the user specified a playlist entry in the URL, ignore the
	233	# rest of the playlist.
	234	if entry_id and p['orig'] != entry_id:
	235	continue
	236
	237	entries[p['orig']] = {
	238	'formats': [],
	239	'thumbnails': [],
	240	'artist': p.get('artist'),
	241	'track': p.get('title'),
	242	'subtitles': {},
	243	}
	244
	245	for track in p.get('tracks', []):
	246	if track['kind'] != 'subtitles':
	247	continue
	248	entries[p['orig']][track['label']] = {
	249	'url': 'https://archive.org/' + track['file'].lstrip('/')
	250	}
	251
	252	metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
	253	m = metadata['metadata']
	254	identifier = m['identifier']
	255
	256	info = {
	257	'id': identifier,
	258	'title': m['title'],
	259	'description': clean_html(m.get('description')),
	260	'uploader': dict_get(m, ['uploader', 'adder']),
	261	'creator': m.get('creator'),
	262	'license': m.get('licenseurl'),
	263	'release_date': unified_strdate(m.get('date')),
	264	'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
	265	'webpage_url': f'https://archive.org/details/{identifier}',
	266	'location': m.get('venue'),
	267	'release_year': int_or_none(m.get('year'))}
	268
	269	for f in metadata['files']:
	270	if f['name'] in entries:
	271	entries[f['name']] = merge_dicts(entries[f['name']], {
	272	'id': identifier + '/' + f['name'],
	273	'title': f.get('title') or f['name'],
	274	'display_id': f['name'],
	275	'description': clean_html(f.get('description')),
	276	'creator': f.get('creator'),
	277	'duration': parse_duration(f.get('length')),
	278	'track_number': int_or_none(f.get('track')),
	279	'album': f.get('album'),
	280	'discnumber': int_or_none(f.get('disc')),
	281	'release_year': int_or_none(f.get('year'))})
	282	entry = entries[f['name']]
	283	elif traverse_obj(f, 'original', expected_type=str) in entries:
	284	entry = entries[f['original']]
	285	else:
	286	continue
	287
	288	if f.get('format') == 'Thumbnail':
	289	entry['thumbnails'].append({
	290	'id': f['name'],
	291	'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
	292	'width': int_or_none(f.get('width')),
	293	'height': int_or_none(f.get('width')),
	294	'filesize': int_or_none(f.get('size'))})
	295
	296	extension = (f['name'].rsplit('.', 1) + [None])[1]
	297
	298	# We don't want to skip private formats if the user has access to them,
	299	# however without access to an account with such privileges we can't implement/test this.
	300	# For now to be safe, we will only skip them if there is no user logged in.
	301	is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))
	302	if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
	303	entry['formats'].append({
	304	'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
	305	'format': f.get('format'),
	306	'width': int_or_none(f.get('width')),
	307	'height': int_or_none(f.get('height')),
	308	'filesize': int_or_none(f.get('size')),
	309	'protocol': 'https',
	310	'source_preference': 0 if f.get('source') == 'original' else -1,
	311	'format_note': f.get('source')
	312	})
	313
	314	for entry in entries.values():
	315	entry['_format_sort_fields'] = ('source', )
	316
	317	if len(entries) == 1:
	318	# If there's only one item, use it as the main info dict
	319	only_video = next(iter(entries.values()))
	320	if entry_id:
	321	info = merge_dicts(only_video, info)
	322	else:
	323	info = merge_dicts(info, only_video)
	324	else:
	325	# Otherwise, we have a playlist.
	326	info['_type'] = 'playlist'
	327	info['entries'] = list(entries.values())
	328
	329	if metadata.get('reviews'):
	330	info['comments'] = []
	331	for review in metadata['reviews']:
	332	info['comments'].append({
	333	'id': review.get('review_id'),
	334	'author': review.get('reviewer'),
	335	'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
	336	'timestamp': unified_timestamp(review.get('createdate')),
	337	'parent': 'root'})
	338
	339	return info
	340
	341
	342	class YoutubeWebArchiveIE(InfoExtractor):
	343	IE_NAME = 'web.archive:youtube'
	344	IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
	345	_VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)\|
	346	(?:https?://)?web\.archive\.org/
	347	(?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_]/)? # /web and the version index is optional
	348	(?:https?(?::\|%3[Aa])//)?(?:
	349	(?:\w+\.)?youtube\.com(?::(?:80\|443))?/watch(?:\.php)?(?:\?\|%3[fF])(?:[^\#]+(?:&\|%26))?v(?:=\|%3[dD]) # Youtube URL
	350	\|(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
	351	)
	352	)(?P<id>[0-9A-Za-z_-]{11})
	353	(?(prefix)
	354	(?::(?P<date2>[0-9]{14}))?$\|
	355	(?:%26\|[#&]\|$)
	356	)'''
	357
	358	_TESTS = [
	359	{
	360	'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
	361	'info_dict': {
	362	'id': 'aYAGB11YrSs',
	363	'ext': 'webm',
	364	'title': 'Team Fortress 2 - Sandviches!',
	365	'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf',
	366	'upload_date': '20110926',
	367	'uploader': 'Zeurel',
	368	'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
	369	'duration': 32,
	370	'uploader_id': 'Zeurel',
	371	'uploader_url': 'https://www.youtube.com/user/Zeurel',
	372	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	373	'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg',
	374	}
	375	}, {
	376	# Internal link
	377	'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
	378	'info_dict': {
	379	'id': '97t7Xj_iBv0',
	380	'ext': 'mp4',
	381	'title': 'Why Machines That Bend Are Better',
	382	'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c',
	383	'upload_date': '20190312',
	384	'uploader': 'Veritasium',
	385	'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
	386	'duration': 771,
	387	'uploader_id': '1veritasium',
	388	'uploader_url': 'https://www.youtube.com/user/1veritasium',
	389	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	390	'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA',
	391	}
	392	}, {
	393	# Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description.
	394	# Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description
	395	'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
	396	'info_dict': {
	397	'id': 'AkhihxRKcrs',
	398	'ext': 'webm',
	399	'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)',
	400	'upload_date': '20120712',
	401	'duration': 398,
	402	'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
	403	'uploader_id': 'machinima',
	404	'uploader_url': 'https://www.youtube.com/user/machinima',
	405	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	406	'uploader': 'machinima'
	407	}
	408	}, {
	409	# FLV video. Video file URL does not provide itag information
	410	'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
	411	'info_dict': {
	412	'id': 'jNQXAC9IVRw',
	413	'ext': 'flv',
	414	'title': 'Me at the zoo',
	415	'upload_date': '20050423',
	416	'channel_id': 'UC4QobU6STFB0P71PMvOGN5A',
	417	'duration': 19,
	418	'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
	419	'uploader_id': 'jawed',
	420	'uploader_url': 'https://www.youtube.com/user/jawed',
	421	'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A',
	422	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	423	'uploader': 'jawed',
	424	}
	425	}, {
	426	'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
	427	'info_dict': {
	428	'id': 'lTx3G6h2xyA',
	429	'ext': 'flv',
	430	'title': 'Madeon - Pop Culture (live mashup)',
	431	'upload_date': '20110711',
	432	'uploader': 'Madeon',
	433	'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w',
	434	'duration': 204,
	435	'description': 'md5:f7535343b6eda34a314eff8b85444680',
	436	'uploader_id': 'itsmadeon',
	437	'uploader_url': 'https://www.youtube.com/user/itsmadeon',
	438	'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w',
	439	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	440	}
	441	}, {
	442	# First capture is of dead video, second is the oldest from CDX response.
	443	'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E',
	444	'info_dict': {
	445	'id': '1JYutPM8O6E',
	446	'ext': 'mp4',
	447	'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
	448	'upload_date': '20160218',
	449	'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
	450	'duration': 1235,
	451	'description': 'md5:21032bae736421e89c2edf36d1936947',
	452	'uploader_id': 'MachinimaETC',
	453	'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
	454	'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
	455	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	456	'uploader': 'ETC News',
	457	}
	458	}, {
	459	# First capture of dead video, capture date in link links to dead capture.
	460	'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E',
	461	'info_dict': {
	462	'id': '6FPhZJGvf4E',
	463	'ext': 'mp4',
	464	'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
	465	'upload_date': '20160219',
	466	'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
	467	'duration': 797,
	468	'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
	469	'uploader_id': 'MachinimaETC',
	470	'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
	471	'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
	472	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	473	'uploader': 'ETC News',
	474	},
	475	'expected_warnings': [
	476	r'unable to download capture webpage \(it may not be archived\)'
	477	]
	478	}, { # Very old YouTube page, has - YouTube in title.
	479	'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg',
	480	'info_dict': {
	481	'id': '-06-KB9XTzg',
	482	'ext': 'flv',
	483	'title': 'New Coin Hack!! 100% Safe!!'
	484	}
	485	}, {
	486	'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8',
	487	'info_dict': {
	488	'id': 'dWW7qP423y8',
	489	'ext': 'mp4',
	490	'title': 'It\'s Bootleg AirPods Time.',
	491	'upload_date': '20211021',
	492	'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
	493	'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
	494	'duration': 810,
	495	'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
	496	'thumbnail': r're:https?://.*\.(jpg\|webp)',
	497	'uploader': 'DankPods',
	498	}
	499	}, {
	500	# player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093

1

import json

import re

import urllib.parse

from .common import InfoExtractor

6

from .youtube import YoutubeBaseInfoExtractor, YoutubeIE

7

from ..compat import compat_HTTPError, compat_urllib_parse_unquote

8

from ..utils import (

KNOWN_EXTENSIONS,

ExtractorError,

HEADRequest,

bug_reports_message,

clean_html,

dict_get,

extract_attributes,

get_element_by_id,

int_or_none,

join_nonempty,

js_to_json,

merge_dicts,

mimetype2ext,

orderedSet,

parse_duration,

parse_qs,

str_or_none,

str_to_int,

traverse_obj,

try_get,

unified_strdate,

unified_timestamp,

url_or_none,

urlhandle_detect_ext,

)

class ArchiveOrgIE(InfoExtractor):
    """Extract video and audio items from archive.org ``details``/``embed`` pages.

    The playlist structure is read from the embeddable player page, while the
    descriptive fields, formats and thumbnails are read from the
    ``/metadata/<identifier>`` JSON document.
    """

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org video and audio'
    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
    _TESTS = [{
        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
            'ext': 'ogv',
            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
            'release_date': '19681210',
            'timestamp': 1268695290,
            'upload_date': '20100315',
            'creator': 'SRI International',
            'uploader': 'laura@archive.org',
            'thumbnail': r're:https://archive\.org/download/.*\.jpg',
            'release_year': 1968,
            'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr',
            'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect',
        },
    }, {
        'url': 'https://archive.org/details/Cops1922',
        'md5': '0869000b4ce265e8ca62738b336b268a',
        'info_dict': {
            'id': 'Cops1922',
            'ext': 'mp4',
            'title': 'Buster Keaton\'s "Cops" (1922)',
            'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca',
            'uploader': 'yorkmba99@hotmail.com',
            'timestamp': 1387699629,
            'upload_date': '20131222',
            'display_id': 'Cops-v2.mp4',
            'thumbnail': r're:https://archive\.org/download/.*\.jpg',
            'duration': 1091.96,
        },
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'only_matching': True,
    }, {
        'url': 'https://archive.org/details/Election_Ads',
        'md5': 'eec5cddebd4793c6a653b69c3b11f2e6',
        'info_dict': {
            'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'ext': 'mpg',
            'thumbnail': r're:https://archive\.org/download/.*\.jpg',
            'duration': 59.77,
            'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
        },
    }, {
        'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
        'md5': 'ea1eed8234e7d4165f38c8c769edef38',
        'info_dict': {
            'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'ext': 'mpg',
            'timestamp': 1205588045,
            'uploader': 'mikedavisstripmaster@yahoo.com',
            'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
            'upload_date': '20080315',
            'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'duration': 59.51,
            'license': 'http://creativecommons.org/licenses/publicdomain/',
            'thumbnail': r're:https://archive\.org/download/.*\.jpg',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
        'md5': '7d07ffb42aba6537c28e053efa4b54c9',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
            'title': 'Turning',
            'ext': 'flac',
            'track': 'Turning',
            'creator': 'Grateful Dead',
            'display_id': 'gd1977-05-08d01t01.flac',
            'track_number': 1,
            'album': '1977-05-08 - Barton Hall - Cornell University',
            'duration': 39.8,
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
        'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
            'title': 'Deal',
            'ext': 'flac',
            'timestamp': 1205895624,
            'uploader': 'mvernon54@yahoo.com',
            'description': 'md5:6c921464414814720c6593810a5c7e3d',
            'upload_date': '20080319',
            'location': 'Barton Hall - Cornell University',
            'duration': 438.68,
            'track': 'Deal',
            'creator': 'Grateful Dead',
            'album': '1977-05-08 - Barton Hall - Cornell University',
            'release_date': '19770508',
            'display_id': 'gd1977-05-08d01t07.flac',
            'release_year': 1977,
            'track_number': 7,
        },
    }, {
        # FIXME: give a better error message than just IndexError when all available formats are restricted
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
        'md5': '7cb019baa9b332e82ea7c10403acd180',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
            'title': 'Bells Of Rostov',
            'ext': 'mp3',
        },
        'skip': 'restricted',
    }, {
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
        'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
            'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
            'ext': 'mp3',
            'timestamp': 1569662587,
            'uploader': 'associate-joygen-odiongan@archive.org',
            'description': 'md5:012b2d668ae753be36896f343d12a236',
            'upload_date': '20190928',
        },
        'skip': 'restricted',
    }, {
        # Original formats are private
        'url': 'https://archive.org/details/irelandthemakingofarepublic',
        'info_dict': {
            'id': 'irelandthemakingofarepublic',
            'title': 'Ireland: The Making of a Republic',
            'upload_date': '20160610',
            'description': 'md5:f70956a156645a658a0dc9513d9e78b7',
            'uploader': 'dimitrios@archive.org',
            'creator': ['British Broadcasting Corporation', 'Time-Life Films'],
            'timestamp': 1465594947,
        },
        'playlist': [
            {
                'md5': '0b211261b26590d49df968f71b90690d',
                'info_dict': {
                    'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov',
                    'ext': 'mp4',
                    'title': 'irelandthemakingofarepublicreel1_01.mov',
                    'duration': 130.46,
                    'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg',
                    'display_id': 'irelandthemakingofarepublicreel1_01.mov',
                },
            }, {
                'md5': '67335ee3b23a0da930841981c1e79b02',
                'info_dict': {
                    'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov',
                    'ext': 'mp4',
                    'duration': 1395.13,
                    'title': 'irelandthemakingofarepublicreel1_02.mov',
                    'display_id': 'irelandthemakingofarepublicreel1_02.mov',
                    'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg',
                },
            }, {
                'md5': 'e470e86787893603f4a341a16c281eb5',
                'info_dict': {
                    'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov',
                    'ext': 'mp4',
                    'duration': 1602.67,
                    'title': 'irelandthemakingofarepublicreel2.mov',
                    'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg',
                    'display_id': 'irelandthemakingofarepublicreel2.mov',
                },
            },
        ],
    }]

    @staticmethod
    def _playlist_data(webpage):
        """Return the decoded playlist embedded in an archive.org embed page.

        The playlist is JSON stored in the ``value`` attribute of the
        ``<input class="js-play8-playlist" ... />`` element.

        Raises IndexError when no such element is found in ``webpage`` and
        KeyError when the element has no ``value`` attribute.
        """
        # Verbose-mode pattern: any number of attributes (unquoted, double- or
        # single-quoted, or valueless) may surround the class attribute.
        element = re.findall(r'''(?xs)
            <input
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s+class=['"]?js-play8-playlist['"]?
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s*/>
        ''', webpage)[0]

        return json.loads(extract_attributes(element)['value'])

    def _real_extract(self, url):
        """Build the info dict for an archive.org item.

        A URL of the form ``.../details/<identifier>/<file>`` restricts the
        result to that single playlist entry. When exactly one entry remains
        it is merged with the item-level metadata; otherwise a playlist is
        returned.
        """
        video_id = urllib.parse.unquote_plus(self._match_id(url))
        identifier, entry_id = (video_id.split('/', 1) + [None])[:2]

        # Archive.org metadata API doesn't clearly demarcate playlist entries
        # or subtitle tracks, so we get them from the embeddable player.
        embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
        playlist = self._playlist_data(embed_page)

        entries = {}
        for p in playlist:
            # If the user specified a playlist entry in the URL, ignore the
            # rest of the playlist.
            if entry_id and p['orig'] != entry_id:
                continue

            entry = entries[p['orig']] = {
                'formats': [],
                'thumbnails': [],
                'artist': p.get('artist'),
                'track': p.get('title'),
                'subtitles': {},
            }

            for track in p.get('tracks', []):
                if track['kind'] != 'subtitles':
                    continue
                # Subtitles belong under the 'subtitles' key as a list per
                # label; writing them directly onto the entry left
                # 'subtitles' empty and polluted the info dict.
                entry['subtitles'].setdefault(track['label'], []).append({
                    'url': 'https://archive.org/' + track['file'].lstrip('/'),
                })

        metadata = self._download_json('https://archive.org/metadata/' + identifier, identifier)
        m = metadata['metadata']
        identifier = m['identifier']

        info = {
            'id': identifier,
            'title': m['title'],
            'description': clean_html(m.get('description')),
            'uploader': dict_get(m, ['uploader', 'adder']),
            'creator': m.get('creator'),
            'license': m.get('licenseurl'),
            'release_date': unified_strdate(m.get('date')),
            'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
            'webpage_url': f'https://archive.org/details/{identifier}',
            'location': m.get('venue'),
            'release_year': int_or_none(m.get('year')),
        }

        # We don't want to skip private formats if the user has access to them,
        # however without access to an account with such privileges we can't implement/test this.
        # For now to be safe, we will only skip them if there is no user logged in.
        # The cookie lookup does not depend on the file, so do it once.
        is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))

        for f in metadata['files']:
            if f['name'] in entries:
                # The file is itself a playlist entry: attach its own metadata.
                entries[f['name']] = merge_dicts(entries[f['name']], {
                    'id': identifier + '/' + f['name'],
                    'title': f.get('title') or f['name'],
                    'display_id': f['name'],
                    'description': clean_html(f.get('description')),
                    'creator': f.get('creator'),
                    'duration': parse_duration(f.get('length')),
                    'track_number': int_or_none(f.get('track')),
                    'album': f.get('album'),
                    'discnumber': int_or_none(f.get('disc')),
                    'release_year': int_or_none(f.get('year')),
                })
                entry = entries[f['name']]
            elif traverse_obj(f, 'original', expected_type=str) in entries:
                # The file is derived from a playlist entry (e.g. a transcode
                # or thumbnail): attach it to that entry.
                entry = entries[f['original']]
            else:
                continue

            file_url = 'https://archive.org/download/' + identifier + '/' + f['name']

            if f.get('format') == 'Thumbnail':
                entry['thumbnails'].append({
                    'id': f['name'],
                    'url': file_url,
                    'width': int_or_none(f.get('width')),
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size')),
                })

            extension = (f['name'].rsplit('.', 1) + [None])[1]

            if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
                entry['formats'].append({
                    'url': file_url,
                    'format': f.get('format'),
                    'width': int_or_none(f.get('width')),
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size')),
                    'protocol': 'https',
                    'source_preference': 0 if f.get('source') == 'original' else -1,
                    'format_note': f.get('source'),
                })

        for entry in entries.values():
            entry['_format_sort_fields'] = ('source', )

        if len(entries) == 1:
            # If there's only one item, use it as the main info dict
            only_video = next(iter(entries.values()))
            if entry_id:
                # An explicitly requested entry keeps its own fields first.
                info = merge_dicts(only_video, info)
            else:
                info = merge_dicts(info, only_video)
        else:
            # Otherwise, we have a playlist.
            info['_type'] = 'playlist'
            info['entries'] = list(entries.values())

        if metadata.get('reviews'):
            info['comments'] = [{
                'id': review.get('review_id'),
                'author': review.get('reviewer'),
                # Either part may be absent; never concatenate None.
                'text': (str_or_none(review.get('reviewtitle'), '')
                         + '\n\n'
                         + str_or_none(review.get('reviewbody'), '')),
                'timestamp': unified_timestamp(review.get('createdate')),
                'parent': 'root',
            } for review in metadata['reviews']]

        return info

class YoutubeWebArchiveIE(InfoExtractor):
    # Extractor for YouTube watch pages saved on web.archive.org.
    IE_NAME = 'web.archive:youtube'
    IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
    # Verbose pattern with two alternatives:
    #   * "ytarchive:<id>[:<14-digit date>]"  (named groups: prefix, id, date2)
    #   * a web.archive.org URL wrapping either a (possibly percent-encoded)
    #     youtube.com watch URL or the internal wayback-fakeurl link
    #     (named groups: date, id)
    # The trailing conditional group only allows ":<date2>" after the id when
    # the "ytarchive:" prefix matched; otherwise the id must be followed by
    # "%26", "#", "&" or the end of the string.
    # Nothing but pattern text and "#" comments may appear inside the literal:
    # in verbose mode any other character (e.g. stray digits) is matched literally.
    _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
            (?:https?://)?web\.archive\.org/
            (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)?  # /web and the version index is optional
            (?:https?(?::|%3[Aa])//)?(?:
                (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD])  # Youtube URL
                |(?:wayback-fakeurl\.archive\.org/yt/)  # Or the internal fake url
            )
        )(?P<id>[0-9A-Za-z_-]{11})
        (?(prefix)
            (?::(?P<date2>[0-9]{14}))?$|
            (?:%26|[#&]|$)
        )'''

# Test cases for the extractor: each entry has a 'url' plus either the expected
# 'info_dict' fields or 'only_matching' (URL must match _VALID_URL only).
_TESTS = [
    {
        'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
        'info_dict': {
            'id': 'aYAGB11YrSs',
            'ext': 'webm',
            'title': 'Team Fortress 2 - Sandviches!',
            'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf',
            'upload_date': '20110926',
            'uploader': 'Zeurel',
            'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
            'duration': 32,
            'uploader_id': 'Zeurel',
            'uploader_url': 'https://www.youtube.com/user/Zeurel',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg',
        }
    }, {
        # Internal link
        'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
        'info_dict': {
            'id': '97t7Xj_iBv0',
            'ext': 'mp4',
            'title': 'Why Machines That Bend Are Better',
            'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c',
            'upload_date': '20190312',
            'uploader': 'Veritasium',
            'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
            'duration': 771,
            'uploader_id': '1veritasium',
            'uploader_url': 'https://www.youtube.com/user/1veritasium',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA',
        }
    }, {
        # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description.
        # Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description
        'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
        'info_dict': {
            'id': 'AkhihxRKcrs',
            'ext': 'webm',
            'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)',
            'upload_date': '20120712',
            'duration': 398,
            'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
            'uploader_id': 'machinima',
            'uploader_url': 'https://www.youtube.com/user/machinima',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'uploader': 'machinima'
        }
    }, {
        # FLV video. Video file URL does not provide itag information
        'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
        'info_dict': {
            'id': 'jNQXAC9IVRw',
            'ext': 'flv',
            'title': 'Me at the zoo',
            'upload_date': '20050423',
            'channel_id': 'UC4QobU6STFB0P71PMvOGN5A',
            'duration': 19,
            'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
            'uploader_id': 'jawed',
            'uploader_url': 'https://www.youtube.com/user/jawed',
            'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'uploader': 'jawed',
        }
    }, {
        'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
        'info_dict': {
            'id': 'lTx3G6h2xyA',
            'ext': 'flv',
            'title': 'Madeon - Pop Culture (live mashup)',
            'upload_date': '20110711',
            'uploader': 'Madeon',
            'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w',
            'duration': 204,
            'description': 'md5:f7535343b6eda34a314eff8b85444680',
            'uploader_id': 'itsmadeon',
            'uploader_url': 'https://www.youtube.com/user/itsmadeon',
            'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
        }
    }, {
        # First capture is of dead video, second is the oldest from CDX response.
        'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E',
        'info_dict': {
            'id': '1JYutPM8O6E',
            'ext': 'mp4',
            'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
            'upload_date': '20160218',
            'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
            'duration': 1235,
            'description': 'md5:21032bae736421e89c2edf36d1936947',
            'uploader_id': 'MachinimaETC',
            'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
            'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'uploader': 'ETC News',
        }
    }, {
        # First capture of dead video, capture date in link links to dead capture.
        'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E',
        'info_dict': {
            'id': '6FPhZJGvf4E',
            'ext': 'mp4',
            'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
            'upload_date': '20160219',
            'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
            'duration': 797,
            'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
            'uploader_id': 'MachinimaETC',
            'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
            'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'uploader': 'ETC News',
        },
        # Regex matched against the warning text; the parentheses are literal
        # characters of the errnote and therefore must be escaped.
        'expected_warnings': [
            r'unable to download capture webpage \(it may not be archived\)'
        ]
    }, {   # Very old YouTube page, has - YouTube in title.
        'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg',
        'info_dict': {
            'id': '-06-KB9XTzg',
            'ext': 'flv',
            'title': 'New Coin Hack!! 100% Safe!!'
        }
    }, {
        'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8',
        'info_dict': {
            'id': 'dWW7qP423y8',
            'ext': 'mp4',
            'title': 'It\'s Bootleg AirPods Time.',
            'upload_date': '20211021',
            'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
            'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
            'duration': 810,
            'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'uploader': 'DankPods',
        }
    }, {
        # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093
        'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4',
        'info_dict': {
            'id': '6Dh-RL__uN4',
            'ext': 'mp4',
            'title': 'bitch lasagna',
            'upload_date': '20181005',
            'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
            'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
            'duration': 135,
            'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0',
            'uploader': 'PewDiePie',
            'uploader_id': 'PewDiePie',
            'uploader_url': 'https://www.youtube.com/user/PewDiePie',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
        }
    }, {
        # ~June 2010 Capture. swfconfig
        'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y',
        'info_dict': {
            'id': '8XeW5ilk-9Y',
            'ext': 'flv',
            'title': 'Story of Stuff, The Critique Part 4 of 4',
            'duration': 541,
            'description': 'md5:28157da06f2c5e94c97f7f3072509972',
            'uploader': 'HowTheWorldWorks',
            'uploader_id': 'HowTheWorldWorks',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
            'upload_date': '20090520',
        }
    }, {
        # Jan 2011: watch-video-date/eow-date surrounded by whitespace
        'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc',
        'info_dict': {
            'id': 'Q_yjX80U7Yc',
            'ext': 'flv',
            'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest',
            'uploader_id': 'claybutlermusic',
            'description': 'md5:4595264559e3d0a0ceb3f011f6334543',
            'upload_date': '20090803',
            'uploader': 'claybutlermusic',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'duration': 132,
            'uploader_url': 'https://www.youtube.com/user/claybutlermusic',
        }
    }, {
        # ~May 2009 swfArgs. ytcfg is spread out over various vars
        'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY',
        'info_dict': {
            'id': 'c5uJgG05xUY',
            'ext': 'webm',
            'title': 'Story of Stuff, The Critique Part 1 of 4',
            'uploader_id': 'HowTheWorldWorks',
            'uploader': 'HowTheWorldWorks',
            'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
            'upload_date': '20090513',
            'description': 'md5:4ca77d79538064e41e4cc464e93f44f0',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'duration': 754,
        }
    }, {
        # ~June 2012. Upload date is in another lang so cannot extract.
        'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA',
        'info_dict': {
            'id': 'xWTLLl-dQaA',
            'ext': 'mp4',
            'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)',
            'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy',
            'description': 'md5:e25f0133aaf9e6793fb81c18021d193e',
            'uploader_id': 'BlackNerdComedy',
            'uploader': 'BlackNerdComedy',
            'duration': 182,
            'thumbnail': r're:https?://.*\.(jpg|webp)',
        }
    }, {
        # ~July 2013
        'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM',
        'info_dict': {
            'id': '9eO1aasHyTM',
            'ext': 'mp4',
            'title': 'Polar-oid',
            'description': 'Cameras and bears are dangerous!',
            'uploader_url': 'https://www.youtube.com/user/punkybird',
            'uploader_id': 'punkybird',
            'duration': 202,
            'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ',
            'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ',
            'upload_date': '20060428',
            'uploader': 'punkybird',
        }
    }, {
        # April 2020: Player response in player config
        'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en',
        'info_dict': {
            'id': 'Cf7vS8jc7dY',
            'ext': 'mp4',
            'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated',
            'duration': 64,
            'upload_date': '20200408',
            'uploader_id': 'GameGrumps',
            'uploader': 'GameGrumps',
            'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ',
            'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'description': 'md5:c625bb3c02c4f5fb4205971e468fa341',
            'uploader_url': 'https://www.youtube.com/user/GameGrumps',
        }
    }, {
        # watch7-user-header with yt-user-info
        'url': 'ytarchive:kbh4T_b4Ixw:20160307085057',
        'info_dict': {
            'id': 'kbh4T_b4Ixw',
            'ext': 'mp4',
            'title': 'Shovel Knight OST - Strike the Earth! Plains of Passage 16 bit SNES style remake / remix',
            'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA',
            'uploader': 'Nelward music',
            'duration': 213,
            'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'upload_date': '20150503',
            'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA',
        }
    }, {
        # April 2012
        'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU',
        'info_dict': {
            'id': 'SOm7mPoPskU',
            'ext': 'mp4',
            'title': 'Boyfriend - Justin Bieber Parody',
            'uploader_url': 'https://www.youtube.com/user/thecomputernerd01',
            'uploader': 'thecomputernerd01',
            'thumbnail': r're:https?://.*\.(jpg|webp)',
            'description': 'md5:dd7fa635519c2a5b4d566beaecad7491',
            'duration': 200,
            'upload_date': '20120407',
            'uploader_id': 'thecomputernerd01',
        }
    }, {
        'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
        'only_matching': True
    }, {
        'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M',
        'only_matching': True
    }, {
        # Video not archived, only capture is unavailable video page
        'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
        'only_matching': True
    }, {   # Encoded url
        'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
        'only_matching': True
    }, {
        'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
        'only_matching': True
    }, {
        'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&search=soccer',
        'only_matching': True
    }, {
        'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
        'only_matching': True
    }, {
        'url': 'ytarchive:BaW_jenozKc:20050214000000',
        'only_matching': True
    }, {
        'url': 'ytarchive:BaW_jenozKc',
        'only_matching': True
    },
]

668

# Pattern locating the ytInitialData blob; reused unchanged from the YouTube extractor.
_YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
# Pattern locating the player response assignment: either the
# `window["ytInitialPlayerResponse"] = ` / `ytInitialPlayerResponse = ` form
# (optionally followed by "(" or whitespace), or the YouTube extractor's own pattern.
# The literal is compiled in verbose mode, so it must contain pattern text only.
_YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x:
    (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*|
    {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}
)'''

_YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com']  # thumbnails most likely archived on these servers
# Default servers first, then img.youtube.com and the i/s ytimg.com hosts with
# no numeric suffix (n == 0), suffixes 1-4, and suffix 9; duplicates are dropped.
_YT_ALL_THUMB_SERVERS = orderedSet(
    _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]])

# %-template taking the 14-digit capture timestamp; the target URL is appended after it.
_WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
# Fallback capture timestamps (YYYYMMDDhhmmss) tried when no better date is known.
_OLDEST_CAPTURE_DATE = 20050214000000
_NEWEST_CAPTURE_DATE = 20500101000000

681

682

def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False):
    """Query the Wayback Machine CDX API for captures of `url`.

    `filters` are appended to the always-present 'statuscode:200' filter,
    `collapse` is passed through as the collapse list, and any entries in
    `query` override the default request parameters.

    Returns a list with one dict per capture row, keyed by the column names
    in the first row of the response. Returns None when the response is an
    empty list, and also (after reporting a warning) when the response is
    not a list or holds only a single row.
    """
    # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
    cdx_params = {
        'url': url,
        'output': 'json',
        'fl': 'original,mimetype,length,timestamp',
        'limit': 500,
        'filter': ['statuscode:200'] + (filters or []),
        'collapse': collapse or [],
    }
    # Caller-supplied parameters take precedence over the defaults above
    cdx_params.update(query or {})

    rows = self._download_json(
        'https://web.archive.org/cdx/search/cdx', item_id,
        note or 'Downloading CDX API JSON', query=cdx_params, fatal=fatal)

    if isinstance(rows, list):
        if len(rows) >= 2:
            # format response to make it easier to use
            header, *captures = rows
            return [dict(zip(header, capture)) for capture in captures]
        if not rows:
            return None

    self.report_warning('Error while parsing CDX API response' + bug_reports_message())
    return None

701

702

def _extract_webpage_title(self, webpage):
    """Return the video title taken from the page's <title>, or '' if absent.

    The title is only accepted when it carries a 'YouTube -' prefix or a
    '- YouTube' suffix, which is stripped from the result.
    """
    raw_title = self._html_extract_title(webpage, default='')
    # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
    title_pattern = r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)'
    return self._html_search_regex(title_pattern, raw_title, 'title', default='')

708

709

def _extract_metadata(self, video_id, webpage):
    """Collect video metadata from an archived YouTube watch page.

    Several generations of page layout are probed (player response /
    initial data JSON, yt.setConfig calls, player/swf config objects,
    <meta> tags and raw HTML), and for every field the first source that
    yields a value wins.

    `webpage` is the capture's HTML; when it is empty the <meta> lookups
    are skipped (they evaluate to None).

    Returns a dict with the keys 'title', 'description', 'upload_date',
    'uploader', 'channel_id', 'channel_url', 'duration', 'uploader_url'
    and 'uploader_id'; fields that could not be found are None.
    """
    search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
    player_response = self._search_json(
        self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response',
        video_id, default={})
    initial_data = self._search_json(
        self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={})

    ytcfg = {}
    # The parentheses of the JS call `yt.setConfig({...});` are literal and must be
    # escaped; every such call found on the page is merged into a single dict.
    for j in re.findall(r'yt\.setConfig\(\s*(?P<json>{\s*(?s:.+?)\s*})\s*\);', webpage):  # ~June 2010
        ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {})

    # XXX: this also may contain a 'ptchn' key
    player_config = (
        self._search_json(
            r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=',
            webpage, 'player config', video_id, default=None)
        or ytcfg.get('PLAYER_CONFIG') or {})

    # XXX: this may also contain a 'creator' key.
    swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={})
    if swf_args and not traverse_obj(player_config, ('args',)):
        # Only fall back to swfArgs when the player config carries no args of its own
        player_config['args'] = swf_args

    if not player_response:
        # April 2020
        player_response = self._parse_json(
            traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False)

    initial_data_video = traverse_obj(
        initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
        expected_type=dict, get_all=False, default={})

    video_details = traverse_obj(
        player_response, 'videoDetails', expected_type=dict, get_all=False, default={})

    microformats = traverse_obj(
        player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={})

    video_title = (
        video_details.get('title')
        or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
        or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
        or traverse_obj(player_config, ('args', 'title'))
        or self._extract_webpage_title(webpage)
        or search_meta(['og:title', 'twitter:title', 'title']))

    def id_from_url(url, type_):
        # Return the path segment following "<type_>/" in `url`, or None
        return self._search_regex(
            rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None)

    # XXX: would the get_elements_by_... functions be better suited here?
    _CHANNEL_URL_HREF_RE = r'href="[^"]*(?P<url>https?://www\.youtube\.com/(?:user|channel)/[^"]+)"'
    uploader_or_channel_url = self._search_regex(
        [fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>',  # @fd05024
         fr'<div\s*id=\"(?:watch-channel-stats|watch-headline-user-info)\"[^>]*>\s*<a[^>]*\b{_CHANNEL_URL_HREF_RE}'],  # ~ May 2009, ~June 2012
        webpage, 'uploader or channel url', default=None)

    owner_profile_url = url_or_none(microformats.get('ownerProfileUrl'))  # @a6211d2

    # Uploader refers to the /user/ id ONLY
    uploader_id = (
        id_from_url(owner_profile_url, 'user')
        or id_from_url(uploader_or_channel_url, 'user')
        or ytcfg.get('VIDEO_USERNAME'))
    uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None

    # XXX: do we want to differentiate uploader and channel?
    uploader = (
        self._search_regex(
            [r'<a\s*id="watch-username"[^>]*>\s*<strong>([^<]+)</strong>',  # June 2010
             r'var\s*watchUsername\s*=\s*\'(.+?)\';',  # ~May 2009
             r'<div\s*\bid=\"watch-channel-stats"[^>]*>\s*<a[^>]*>\s*(.+?)\s*</a',  # ~May 2009
             r'<a\s*id="watch-userbanner"[^>]*title="\s*(.+?)\s*"'],  # ~June 2012
            webpage, 'uploader', default=None)
        or self._html_search_regex(
            [r'(?s)<div\s*class="yt-user-info".*?<a[^>]*[^>]*>\s*(.*?)\s*</a',  # March 2016
             r'(?s)<a[^>]*yt-user-name[^>]*>\s*(.*?)\s*</a'],  # july 2013
            get_element_by_id('watch7-user-header', webpage), 'uploader', default=None)
        or self._html_search_regex(
            r'<button\s*href="/user/[^>]*>\s*<span[^>]*>\s*(.+?)\s*<',  # April 2012
            get_element_by_id('watch-headline-user-info', webpage), 'uploader', default=None)
        or traverse_obj(player_config, ('args', 'creator'))
        or video_details.get('author'))

    channel_id = str_or_none(
        video_details.get('channelId')
        or microformats.get('externalChannelId')
        or search_meta('channelId')
        or self._search_regex(
            r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',  # @b45a9e6
            webpage, 'channel id', default=None, group='id')
        or id_from_url(owner_profile_url, 'channel')
        or id_from_url(uploader_or_channel_url, 'channel')
        or traverse_obj(player_config, ('args', 'ucid')))

    channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None
    duration = int_or_none(
        video_details.get('lengthSeconds')
        or microformats.get('lengthSeconds')
        or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False)
        or parse_duration(search_meta('duration')))
    description = (
        video_details.get('shortDescription')
        or YoutubeBaseInfoExtractor._get_text(microformats, 'description')
        or clean_html(get_element_by_id('eow-description', webpage))  # @9e6dd23
        or search_meta(['description', 'og:description', 'twitter:description']))

    upload_date = unified_strdate(
        dict_get(microformats, ('uploadDate', 'publishDate'))
        or search_meta(['uploadDate', 'datePublished'])
        or self._search_regex(
            [r'(?s)id="eow-date.*?>\s*(.*?)\s*</span>',
             r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']',  # @7998520
             r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'],  # ~June 2010, ~Jan 2009 (respectively)
            webpage, 'upload date', default=None))

    return {
        'title': video_title,
        'description': description,
        'upload_date': upload_date,
        'uploader': uploader,
        'channel_id': channel_id,
        'channel_url': channel_url,
        'duration': duration,
        'uploader_url': uploader_url,
        'uploader_id': uploader_id,
    }

837

838

def _extract_thumbnails(self, video_id):
    """Look up archived thumbnails for `video_id` through the CDX API.

    By default only the default thumbnail servers and jpg URLs are
    queried, and the search stops after the first base URL that yields
    captures. When 'thumbnails' is listed in the 'check_all' extractor
    argument, every known server and both jpg and webp URLs are queried.

    Returns a list of thumbnail dicts ('url', 'filesize', 'preference')
    after duplicate removal.
    """
    check_everything = 'thumbnails' in self._configuration_arg('check_all')
    if check_everything:
        servers, extensions = self._YT_ALL_THUMB_SERVERS, ('jpg', 'webp')
    else:
        servers, extensions = self._YT_DEFAULT_THUMB_SERVERS, ('jpg',)

    base_urls = []
    for server in servers:
        for ext in extensions:
            base_urls.append('http://{server}/vi{webp}/{video_id}'.format(
                webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server))

    found = []
    for base_url in base_urls:
        captures = self._call_cdx_api(
            video_id, base_url, filters=['mimetype:image/(?:webp|jpeg)'],
            collapse=['urlkey'], query={'matchType': 'prefix'})
        if not captures:
            continue
        for capture in captures:
            # Captures without a usable timestamp fall back to the oldest capture date
            capture_date = int_or_none(capture.get('timestamp')) or self._OLDEST_CAPTURE_DATE
            size = int_or_none(capture.get('length'))
            found.append({
                'url': (self._WAYBACK_BASE_URL % capture_date) + capture.get('original'),
                'filesize': size,
                # The reported length doubles as the preference value
                'preference': size,
            })
        if not check_everything:
            break

    self._remove_duplicate_formats(found)
    return found

862

863

def _get_capture_dates(self, video_id, url_date):
    """Build the ordered list of capture timestamps to try for `video_id`.

    Order: the earliest capture from July 2020 onwards (if any), the date
    taken from the URL, the earliest capture overall, then, only when
    'captures' is listed in the 'check_all' extractor argument, every
    known capture, and finally the oldest/newest fallback dates.
    Falsy entries are dropped and the result is de-duplicated in order.
    """
    # Note: CDX API will not find watch pages with extra params in the url.
    cdx_rows = self._call_cdx_api(
        video_id, f'https://www.youtube.com/watch?v={video_id}',
        filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or []

    timestamps = []
    for row in cdx_rows:
        stamp = int_or_none(row['timestamp'])
        if stamp is not None:
            timestamps.append(stamp)
    timestamps.sort()

    # Prefer the new polymer UI captures as we support extracting more metadata from them
    # WBM captures seem to all switch to this layout ~July 2020
    recent = [stamp for stamp in timestamps if stamp >= 20200701000000]

    candidates = []
    if recent:
        candidates.append(recent[0])
    candidates.append(url_date)
    if timestamps:
        candidates.append(timestamps[0])

    if 'captures' in self._configuration_arg('check_all'):
        candidates.extend(recent + timestamps)

    # Fallbacks if any of the above fail
    candidates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
    return orderedSet(filter(None, candidates))

886

887

def _real_extract(self, url):
    """Extract an archived YouTube video.

    Steps: resolve the archived video file through the wayback fake URL
    (HEAD request, retried via the retry manager), walk the candidate
    capture dates until a capture yields a title, collect thumbnails,
    and finally derive the single format from the resolved file URL and
    its response headers. The video id is used as title when no capture
    provided one.
    """
    match = self._match_valid_url(url)
    video_id = match.group('id')
    url_date = match.group('date') or match.group('date2')

    urlh = None
    retry_manager = self.RetryManager(fatal=False)
    for retry in retry_manager:
        try:
            urlh = self._request_webpage(
                HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id),
                video_id, note='Fetching archived video file url', expected_status=True)
        except ExtractorError as e:
            not_archived = isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
            if not not_archived:
                retry.error = e
                continue
            # HTTP Error 404 is expected if the video is not saved.
            self.raise_no_formats(
                'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)

    if retry_manager.error:
        self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id)

    capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
    self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
    info = {'id': video_id}
    for capture in capture_dates:
        webpage = self._download_webpage(
            (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
            video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
            note='Downloading capture webpage')
        current_info = self._extract_metadata(video_id, webpage or '')
        # Try avoid getting deleted video metadata
        if not current_info.get('title'):
            continue
        info = merge_dicts(info, current_info)
        if 'captures' not in self._configuration_arg('check_all'):
            break

    info['thumbnails'] = self._extract_thumbnails(video_id)

    if urlh:
        video_file_url = compat_urllib_parse_unquote(urlh.geturl())
        video_file_url_qs = parse_qs(video_file_url)
        # Attempt to recover any ext & format info from playback url & response headers
        fmt = {
            'url': video_file_url,
            'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length')),
        }
        itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
        if itag and itag in YoutubeIE._formats:
            fmt.update(YoutubeIE._formats[itag])
            fmt['format_id'] = itag
        else:
            mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
            fmt['ext'] = (
                mimetype2ext(mime)
                or urlhandle_detect_ext(urlh)
                or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type')))
        info['formats'] = [fmt]
        if not info.get('duration'):
            info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))

    if not info.get('title'):
        info['title'] = video_id
    return info