import re

from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import (
    ExtractorError,
    determine_ext,
    get_element_by_class,
    int_or_none,
    lowercase_escape,
    try_get,
    update_url_query,
)
class GoogleDriveIE(InfoExtractor):
    """Extractor for individual Google Drive files (watch/download URLs)."""
    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
        },
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'only_matching': True,
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
    # Map of Google itag-style format codes to container extensions, used by
    # _real_extract when building formats from fmt_stream_map.
    # NOTE(review): this table was elided from the corrupted source and is
    # reconstructed from upstream — confirm against the original file.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML tag that holds each caption entry, keyed by caption type
    # ('subtitles' entries use <track>, 'automatic_captions' use <target>).
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Both populated lazily by _download_subtitles_xml().
    _caption_formats_ext = []
    _captions_xml = None
81 def _extract_embed_urls(cls
, url
, webpage
):
83 r
'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
86 yield 'https://drive.google.com/file/d/%s' % mobj
.group('id')
88 def _download_subtitles_xml(self
, video_id
, subtitles_id
, hl
):
89 if self
._captions
_xml
:
91 self
._captions
_xml
= self
._download
_xml
(
92 self
._BASE
_URL
_CAPTIONS
, video_id
, query
={
101 }, note
='Downloading subtitles XML',
102 errnote
='Unable to download subtitles XML', fatal
=False)
103 if self
._captions
_xml
:
104 for f
in self
._captions
_xml
.findall('format'):
105 if f
.attrib
.get('fmt_code') and not f
.attrib
.get('default'):
106 self
._caption
_formats
_ext
.append(f
.attrib
['fmt_code'])
108 def _get_captions_by_type(self
, video_id
, subtitles_id
, caption_type
,
109 origin_lang_code
=None):
110 if not subtitles_id
or not caption_type
:
113 for caption_entry
in self
._captions
_xml
.findall(
114 self
._CAPTIONS
_ENTRY
_TAG
[caption_type
]):
115 caption_lang_code
= caption_entry
.attrib
.get('lang_code')
116 if not caption_lang_code
:
118 caption_format_data
= []
119 for caption_format
in self
._caption
_formats
_ext
:
123 'fmt': caption_format
,
124 'lang': (caption_lang_code
if origin_lang_code
is None
125 else origin_lang_code
),
130 if origin_lang_code
is not None:
131 query
.update({'tlang': caption_lang_code}
)
132 caption_format_data
.append({
133 'url': update_url_query(self
._BASE
_URL
_CAPTIONS
, query
),
134 'ext': caption_format
,
136 captions
[caption_lang_code
] = caption_format_data
139 def _get_subtitles(self
, video_id
, subtitles_id
, hl
):
140 if not subtitles_id
or not hl
:
142 self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
)
143 if not self
._captions
_xml
:
145 return self
._get
_captions
_by
_type
(video_id
, subtitles_id
, 'subtitles')
147 def _get_automatic_captions(self
, video_id
, subtitles_id
, hl
):
148 if not subtitles_id
or not hl
:
150 self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
)
151 if not self
._captions
_xml
:
153 track
= self
._captions
_xml
.find('track')
156 origin_lang_code
= track
.attrib
.get('lang_code')
157 if not origin_lang_code
:
159 return self
._get
_captions
_by
_type
(
160 video_id
, subtitles_id
, 'automatic_captions', origin_lang_code
)
162 def _real_extract(self
, url
):
163 video_id
= self
._match
_id
(url
)
164 video_info
= compat_parse_qs(self
._download
_webpage
(
165 'https://drive.google.com/get_video_info',
166 video_id
, query
={'docid': video_id}
))
169 return try_get(video_info
, lambda x
: x
[key
][0])
171 reason
= get_value('reason')
172 title
= get_value('title')
173 if not title
and reason
:
174 raise ExtractorError(reason
, expected
=True)
177 fmt_stream_map
= (get_value('fmt_stream_map') or '').split(',')
178 fmt_list
= (get_value('fmt_list') or '').split(',')
179 if fmt_stream_map
and fmt_list
:
183 r
'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt
)
185 resolutions
[mobj
.group('format_id')] = (
186 int(mobj
.group('width')), int(mobj
.group('height')))
188 for fmt_stream
in fmt_stream_map
:
189 fmt_stream_split
= fmt_stream
.split('|')
190 if len(fmt_stream_split
) < 2:
192 format_id
, format_url
= fmt_stream_split
[:2]
194 'url': lowercase_escape(format_url
),
195 'format_id': format_id
,
196 'ext': self
._FORMATS
_EXT
[format_id
],
198 resolution
= resolutions
.get(format_id
)
201 'width': resolution
[0],
202 'height': resolution
[1],
206 source_url
= update_url_query(
207 'https://drive.google.com/uc', {
209 'export': 'download',
212 def request_source_file(source_url
, kind
):
213 return self
._request
_webpage
(
214 source_url
, video_id
, note
='Requesting %s file' % kind
,
215 errnote
='Unable to request %s file' % kind
, fatal
=False)
216 urlh
= request_source_file(source_url
, 'source')
218 def add_source_format(urlh
):
220 # Use redirect URLs as download URLs in order to calculate
221 # correct cookies in _calc_cookies.
222 # Using original URLs may result in redirect loop due to
223 # google.com's cookies mistakenly used for googleusercontent.com
224 # redirect URLs (see #23919).
225 'url': urlh
.geturl(),
226 'ext': determine_ext(title
, 'mp4').lower(),
227 'format_id': 'source',
230 if urlh
.headers
.get('Content-Disposition'):
231 add_source_format(urlh
)
233 confirmation_webpage
= self
._webpage
_read
_content
(
234 urlh
, url
, video_id
, note
='Downloading confirmation page',
235 errnote
='Unable to confirm download', fatal
=False)
236 if confirmation_webpage
:
237 confirm
= self
._search
_regex
(
238 r
'confirm=([^&"\']+)', confirmation_webpage,
239 'confirmation code
', default=None)
241 confirmed_source_url = update_url_query(source_url, {
244 urlh = request_source_file(confirmed_source_url, 'confirmed source
')
245 if urlh and urlh.headers.get('Content
-Disposition
'):
246 add_source_format(urlh)
249 get_element_by_class('uc
-error
-subcaption
', confirmation_webpage)
250 or get_element_by_class('uc
-error
-caption
', confirmation_webpage)
251 or 'unable to extract confirmation code
')
253 if not formats and reason:
254 self.raise_no_formats(reason, expected=True)
256 self._sort_formats(formats)
260 ttsurl = get_value('ttsurl
')
262 # the video Id for subtitles will be the last value in the ttsurl
264 subtitles_id = ttsurl.encode('utf
-8').decode(
265 'unicode_escape
').split('=')[-1]
267 self.cookiejar.clear(domain='.google
.com
', path='/', name='NID
')
272 'thumbnail
': 'https
://drive
.google
.com
/thumbnail?
id=' + video_id,
273 'duration
': int_or_none(get_value('length_seconds
')),
275 'subtitles
': self.extract_subtitles(video_id, subtitles_id, hl),
276 'automatic_captions
': self.extract_automatic_captions(
277 video_id, subtitles_id, hl),
class GoogleDriveFolderIE(InfoExtractor):
    """Extractor for Google Drive folders: yields a playlist of file URLs
    obtained from the batch drive/v2beta API."""
    IE_NAME = 'GoogleDrive:Folder'
    _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})'
    _TESTS = [{
        'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
        'info_dict': {
            'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
            'title': 'Forrest',
        },
        'playlist_count': 3,
    }]
    # Multipart boundary used by both the request body and the $ct parameter.
    _BOUNDARY = '=====vc17a3rwnndj====='
    _REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1"
    # NOTE(review): the tail of this multipart template was elided from the
    # corrupted source and is reconstructed from upstream.
    _DATA = f'''--{_BOUNDARY}
content-type: application/http
content-transfer-encoding: binary

GET %s

--{_BOUNDARY}
'''

    def _call_api(self, folder_id, key, data, **kwargs):
        """POST one batch request and return the parsed JSON payload ({} on
        failure)."""
        response = self._download_webpage(
            'https://clients6.google.com/batch/drive/v2beta',
            folder_id, data=data.encode('utf-8'),
            headers={
                'Content-Type': 'text/plain;charset=UTF-8;',
                'Origin': 'https://drive.google.com',
            }, query={
                '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"',
                'key': key,
            }, **kwargs)
        # The multipart response wraps the JSON; _search_json digs it out.
        return self._search_json('', response, 'api response', folder_id, **kwargs) or {}

    def _get_folder_items(self, folder_id, key):
        """Yield file items from every result page of the folder listing."""
        page_token = ''
        while page_token is not None:
            request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key)
            page = self._call_api(folder_id, key, self._DATA % request)
            yield from page['items']
            # nextPageToken is absent on the last page -> loop terminates.
            page_token = page.get('nextPageToken')

    def _real_extract(self, url):
        folder_id = self._match_id(url)

        webpage = self._download_webpage(url, folder_id)
        # The 39-character API key is embedded in the folder webpage.
        key = self._search_regex(r'"(\w{39})"', webpage, 'key')

        folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False)

        return self.playlist_from_matches(
            self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'),
            ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}')