jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/googledrive.py

Commit	Line	Data
3e5f3df1	1	import re
add96eb9	2	import urllib.parse
3e5f3df1	3
984e4d48	4	from .common import InfoExtractor
85ec2a33	5	from .youtube import YoutubeIE
8e92d21e	6	from ..utils import (
8e92d21e	7	ExtractorError,
85ec2a33	8	bug_reports_message,
4d248e29	9	determine_ext,
3b7f5300	10	extract_attributes,
2181983a	11	get_element_by_class,
3b7f5300	12	get_element_html_by_id,
5b251628	13	int_or_none,
e4e50f60	14	lowercase_escape,
a0566bbf	15	try_get,
05915e37	16	update_url_query,
8e92d21e	17	)
984e4d48	18
5b251628	19
class GoogleDriveIE(InfoExtractor):
    """Extractor for single files hosted on Google Drive / Google Docs."""

    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive|drive\.usercontent)\.google\.com/
                                (?:
                                    (?:uc|open|download)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
            'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
        },
    }, {
        # has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922)
        'url': 'https://drive.google.com/uc?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
        'md5': '322db8d63dd19788c04050a4bba67073',
        'info_dict': {
            'id': '1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
            'ext': 'mp3',
            'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3',
            'duration': 184,
            'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
        },
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'only_matching': True,
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
        'only_matching': True,
    }]
    # itag -> file extension; itag 50 is added by hand because it is missing
    # from YoutubeIE._formats (see the second test above).
    _FORMATS_EXT = {
        **{k: v['ext'] for k, v in YoutubeIE._formats.items() if v.get('ext')},
        '50': 'm4a',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # Caption kind -> tag name of the matching entries in the captions XML.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Per-video caption state. These class-level values are only defaults:
    # _download_subtitles_xml() rebinds all three on the instance and never
    # mutates the shared list in place.
    _caption_formats_ext = []
    _captions_xml = None
    _captions_request_key = None

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield the canonical file URL of the first Drive iframe found in *webpage*."""
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            yield 'https://drive.google.com/file/d/{}'.format(mobj.group('id'))

    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        """Fetch the caption listing for one video and cache it on the instance.

        The cache is keyed by (video_id, subtitles_id, hl): the same extractor
        instance may be asked about several videos, and a listing fetched for
        one video must never be served for another. A failed or empty listing
        leaves ``_captions_xml`` as None and is not retried for the same key.
        """
        request_key = (video_id, subtitles_id, hl)
        if self._captions_request_key == request_key:
            return
        self._captions_request_key = request_key
        # Reset on the instance so neither stale data nor the class-level
        # default list is reused.
        self._captions_xml = None
        self._caption_formats_ext = []

        captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)
        # A non-fatal download may hand back None/False instead of an element.
        # An element without children carries no caption data either; test it
        # with len() because Element truth-value testing is deprecated.
        if captions_xml is None or captions_xml is False or not len(captions_xml):
            return

        self._captions_xml = captions_xml
        self._caption_formats_ext = [
            f.attrib['fmt_code'] for f in captions_xml.findall('format')
            if f.attrib.get('fmt_code') and not f.attrib.get('default')]

    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        """Build a ``{lang_code: [{'url', 'ext'}, ...]}`` mapping from the cached XML.

        *caption_type* is a key of ``_CAPTIONS_ENTRY_TAG``. When
        *origin_lang_code* is given, every URL requests that source language
        with the entry's language passed as ``tlang``; otherwise the entry's
        own language is requested. Returns None when *subtitles_id* or
        *caption_type* is missing or no captions XML is cached.
        """
        if not subtitles_id or not caption_type or self._captions_xml is None:
            return
        captions = {}
        for caption_entry in self._captions_xml.findall(
                self._CAPTIONS_ENTRY_TAG[caption_type]):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': (caption_lang_code if origin_lang_code is None
                             else origin_lang_code),
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query.update({'tlang': caption_lang_code})
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        """Return the subtitle tracks of the video, or None if unavailable."""
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        """Return the automatic captions of the video, or None if unavailable.

        The language of the first ``track`` element in the captions XML is
        used as the source language of every automatic caption URL.
        """
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

    def _real_extract(self, url):
        """Collect stream formats, the source-file format and captions for one file."""
        video_id = self._match_id(url)
        video_info = urllib.parse.parse_qs(self._download_webpage(
            'https://drive.google.com/get_video_info',
            video_id, 'Downloading video webpage', query={'docid': video_id}))

        def get_value(key):
            # parse_qs maps each key to a list; take the first value or None.
            return try_get(video_info, lambda x: x[key][0])

        reason = get_value('reason')
        title = get_value('title')

        formats = []
        # A missing value becomes [''], whose single empty entry is skipped by
        # the checks inside both loops below.
        fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
        fmt_list = (get_value('fmt_list') or '').split(',')

        resolutions = {}
        for fmt in fmt_list:
            mobj = re.search(
                r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    int(mobj.group('width')), int(mobj.group('height')))

        for fmt_stream in fmt_stream_map:
            # Each entry has the form "<format_id>|<url>".
            fmt_stream_split = fmt_stream.split('|')
            if len(fmt_stream_split) < 2:
                continue
            format_id, format_url = fmt_stream_split[:2]
            ext = self._FORMATS_EXT.get(format_id)
            if not ext:
                self.report_warning(f'Unknown format {format_id}{bug_reports_message()}')
            f = {
                'url': lowercase_escape(format_url),
                'format_id': format_id,
                'ext': ext,
            }
            resolution = resolutions.get(format_id)
            if resolution:
                f.update({
                    'width': resolution[0],
                    'height': resolution[1],
                })
            formats.append(f)

        source_url = update_url_query(
            'https://drive.usercontent.google.com/download', {
                'id': video_id,
                'export': 'download',
                'confirm': 't',
            })

        def request_source_file(source_url, kind, data=None):
            return self._request_webpage(
                source_url, video_id, note=f'Requesting {kind} file',
                errnote=f'Unable to request {kind} file', fatal=False, data=data)
        urlh = request_source_file(source_url, 'source')
        if urlh:
            def add_source_format(urlh):
                nonlocal title
                if not title:
                    # Fall back to the file name announced by the server.
                    title = self._search_regex(
                        r'\bfilename="([^"]+)"', urlh.headers.get('Content-Disposition'),
                        'title', default=None)
                formats.append({
                    # Use redirect URLs as download URLs in order to calculate
                    # correct cookies in _calc_cookies.
                    # Using original URLs may result in redirect loop due to
                    # google.com's cookies mistakenly used for googleusercontent.com
                    # redirect URLs (see #23919).
                    'url': urlh.url,
                    'ext': determine_ext(title, 'mp4').lower(),
                    'format_id': 'source',
                    'quality': 1,
                })
            if urlh.headers.get('Content-Disposition'):
                add_source_format(urlh)
            else:
                # No Content-Disposition header: the response body is read as a
                # confirmation page whose download form carries the final URL.
                confirmation_webpage = self._webpage_read_content(
                    urlh, url, video_id, note='Downloading confirmation page',
                    errnote='Unable to confirm download', fatal=False)
                if confirmation_webpage:
                    confirmed_source_url = extract_attributes(
                        get_element_html_by_id('download-form', confirmation_webpage) or '').get('action')
                    if confirmed_source_url:
                        urlh = request_source_file(confirmed_source_url, 'confirmed source', data=b'')
                        if urlh and urlh.headers.get('Content-Disposition'):
                            add_source_format(urlh)
                    else:
                        self.report_warning(
                            get_element_by_class('uc-error-subcaption', confirmation_webpage)
                            or get_element_by_class('uc-error-caption', confirmation_webpage)
                            or 'unable to extract confirmation code')

        if not formats and reason:
            if title:
                self.raise_no_formats(reason, expected=True)
            else:
                raise ExtractorError(reason, expected=True)

        hl = get_value('hl')
        subtitles_id = None
        ttsurl = get_value('ttsurl')
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode().decode(
                'unicode_escape').split('=')[-1]

        # Drop the NID cookie stored for .google.com before returning.
        self.cookiejar.clear(domain='.google.com', path='/', name='NID')

        return {
            'id': video_id,
            'title': title,
            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
            'duration': int_or_none(get_value('length_seconds')),
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }
145c5a83 ES	288
	289
	290	class GoogleDriveFolderIE(InfoExtractor):
	291	IE_NAME = 'GoogleDrive:Folder'
	292	_VALID_URL = r'https?://(?:docs\|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})'
	293	_TESTS = [{
	294	'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
	295	'info_dict': {
	296	'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
add96eb9	297	'title': 'Forrest',
145c5a83 ES	298	},
	299	'playlist_count': 3,
	300	}]
	301	_BOUNDARY = '=====vc17a3rwnndj====='
	302	_REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1"
	303	_DATA = f'''--{_BOUNDARY}
	304	content-type: application/http
	305	content-transfer-encoding: binary
	306
	307	GET %s
	308
	309	--{_BOUNDARY}
	310	'''
	311
	312	def _call_api(self, folder_id, key, data, **kwargs):
	313	response = self._download_webpage(
	314	'https://clients6.google.com/batch/drive/v2beta',
add96eb9	315	folder_id, data=data.encode(),
145c5a83 ES	316	headers={
	317	'Content-Type': 'text/plain;charset=UTF-8;',
	318	'Origin': 'https://drive.google.com',
	319	}, query={
	320	'$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"',
add96eb9	321	'key': key,
145c5a83 ES	322	}, **kwargs)
	323	return self._search_json('', response, 'api response', folder_id, **kwargs) or {}
	324
	325	def _get_folder_items(self, folder_id, key):
	326	page_token = ''
	327	while page_token is not None:
	328	request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key)
	329	page = self._call_api(folder_id, key, self._DATA % request)
	330	yield from page['items']
	331	page_token = page.get('nextPageToken')
	332
	333	def _real_extract(self, url):
	334	folder_id = self._match_id(url)
	335
	336	webpage = self._download_webpage(url, folder_id)
	337	key = self._search_regex(r'"(\w{39})"', webpage, 'key')
	338
	339	folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False)
	340
	341	return self.playlist_from_matches(
	342	self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'),
	343	ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}')