import re

from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import (
    ExtractorError,
    determine_ext,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_id,
    int_or_none,
    lowercase_escape,
    try_get,
    update_url_query,
)
18 class GoogleDriveIE(InfoExtractor
):
22 (?:docs|drive)\.google\.com/
27 video\.google\.com/get_player\?.*?docid=
29 (?P<id>[a-zA-Z0-9_-]{28,})
32 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
33 'md5': '5c602afbbf2c1db91831f5d82f678554',
35 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
37 'title': 'Big Buck Bunny.mp4',
39 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
42 # video can't be watched anonymously due to view count limit reached,
43 # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
44 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
45 'only_matching': True,
47 # video id is longer than 28 characters
48 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
49 'only_matching': True,
51 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
52 'only_matching': True,
54 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
55 'only_matching': True,
75 _BASE_URL_CAPTIONS
= 'https://drive.google.com/timedtext'
76 _CAPTIONS_ENTRY_TAG
= {
78 'automatic_captions': 'target',
80 _caption_formats_ext
= []
84 def _extract_embed_urls(cls
, url
, webpage
):
86 r
'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
89 yield 'https://drive.google.com/file/d/%s' % mobj
.group('id')
91 def _download_subtitles_xml(self
, video_id
, subtitles_id
, hl
):
92 if self
._captions
_xml
:
94 self
._captions
_xml
= self
._download
_xml
(
95 self
._BASE
_URL
_CAPTIONS
, video_id
, query
={
104 }, note
='Downloading subtitles XML',
105 errnote
='Unable to download subtitles XML', fatal
=False)
106 if self
._captions
_xml
:
107 for f
in self
._captions
_xml
.findall('format'):
108 if f
.attrib
.get('fmt_code') and not f
.attrib
.get('default'):
109 self
._caption
_formats
_ext
.append(f
.attrib
['fmt_code'])
111 def _get_captions_by_type(self
, video_id
, subtitles_id
, caption_type
,
112 origin_lang_code
=None):
113 if not subtitles_id
or not caption_type
:
116 for caption_entry
in self
._captions
_xml
.findall(
117 self
._CAPTIONS
_ENTRY
_TAG
[caption_type
]):
118 caption_lang_code
= caption_entry
.attrib
.get('lang_code')
119 if not caption_lang_code
:
121 caption_format_data
= []
122 for caption_format
in self
._caption
_formats
_ext
:
126 'fmt': caption_format
,
127 'lang': (caption_lang_code
if origin_lang_code
is None
128 else origin_lang_code
),
133 if origin_lang_code
is not None:
134 query
.update({'tlang': caption_lang_code}
)
135 caption_format_data
.append({
136 'url': update_url_query(self
._BASE
_URL
_CAPTIONS
, query
),
137 'ext': caption_format
,
139 captions
[caption_lang_code
] = caption_format_data
142 def _get_subtitles(self
, video_id
, subtitles_id
, hl
):
143 if not subtitles_id
or not hl
:
145 self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
)
146 if not self
._captions
_xml
:
148 return self
._get
_captions
_by
_type
(video_id
, subtitles_id
, 'subtitles')
150 def _get_automatic_captions(self
, video_id
, subtitles_id
, hl
):
151 if not subtitles_id
or not hl
:
153 self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
)
154 if not self
._captions
_xml
:
156 track
= self
._captions
_xml
.find('track')
159 origin_lang_code
= track
.attrib
.get('lang_code')
160 if not origin_lang_code
:
162 return self
._get
_captions
_by
_type
(
163 video_id
, subtitles_id
, 'automatic_captions', origin_lang_code
)
165 def _real_extract(self
, url
):
166 video_id
= self
._match
_id
(url
)
167 video_info
= compat_parse_qs(self
._download
_webpage
(
168 'https://drive.google.com/get_video_info',
169 video_id
, 'Downloading video webpage', query
={'docid': video_id}
))
172 return try_get(video_info
, lambda x
: x
[key
][0])
174 reason
= get_value('reason')
175 title
= get_value('title')
178 fmt_stream_map
= (get_value('fmt_stream_map') or '').split(',')
179 fmt_list
= (get_value('fmt_list') or '').split(',')
180 if fmt_stream_map
and fmt_list
:
184 r
'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt
)
186 resolutions
[mobj
.group('format_id')] = (
187 int(mobj
.group('width')), int(mobj
.group('height')))
189 for fmt_stream
in fmt_stream_map
:
190 fmt_stream_split
= fmt_stream
.split('|')
191 if len(fmt_stream_split
) < 2:
193 format_id
, format_url
= fmt_stream_split
[:2]
195 'url': lowercase_escape(format_url
),
196 'format_id': format_id
,
197 'ext': self
._FORMATS
_EXT
[format_id
],
199 resolution
= resolutions
.get(format_id
)
202 'width': resolution
[0],
203 'height': resolution
[1],
207 source_url
= update_url_query(
208 'https://drive.google.com/uc', {
210 'export': 'download',
213 def request_source_file(source_url
, kind
, data
=None):
214 return self
._request
_webpage
(
215 source_url
, video_id
, note
='Requesting %s file' % kind
,
216 errnote
='Unable to request %s file' % kind
, fatal
=False, data
=data
)
217 urlh
= request_source_file(source_url
, 'source')
219 def add_source_format(urlh
):
222 title
= self
._search
_regex
(
223 r
'\bfilename="([^"]+)"', urlh
.headers
.get('Content-Disposition'),
224 'title', default
=None)
226 # Use redirect URLs as download URLs in order to calculate
227 # correct cookies in _calc_cookies.
228 # Using original URLs may result in redirect loop due to
229 # google.com's cookies mistakenly used for googleusercontent.com
230 # redirect URLs (see #23919).
231 'url': urlh
.geturl(),
232 'ext': determine_ext(title
, 'mp4').lower(),
233 'format_id': 'source',
236 if urlh
.headers
.get('Content-Disposition'):
237 add_source_format(urlh
)
239 confirmation_webpage
= self
._webpage
_read
_content
(
240 urlh
, url
, video_id
, note
='Downloading confirmation page',
241 errnote
='Unable to confirm download', fatal
=False)
242 if confirmation_webpage
:
243 confirmed_source_url
= extract_attributes(
244 get_element_html_by_id('download-form', confirmation_webpage
) or '').get('action')
245 if confirmed_source_url
:
246 urlh
= request_source_file(confirmed_source_url
, 'confirmed source', data
=b
'')
247 if urlh
and urlh
.headers
.get('Content-Disposition'):
248 add_source_format(urlh
)
251 get_element_by_class('uc-error-subcaption', confirmation_webpage
)
252 or get_element_by_class('uc-error-caption', confirmation_webpage
)
253 or 'unable to extract confirmation code')
255 if not formats
and reason
:
257 self
.raise_no_formats(reason
, expected
=True)
259 raise ExtractorError(reason
, expected
=True)
263 ttsurl
= get_value('ttsurl')
265 # the video Id for subtitles will be the last value in the ttsurl
267 subtitles_id
= ttsurl
.encode('utf-8').decode(
268 'unicode_escape').split('=')[-1]
270 self
.cookiejar
.clear(domain
='.google.com', path
='/', name
='NID')
275 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id
,
276 'duration': int_or_none(get_value('length_seconds')),
278 'subtitles': self
.extract_subtitles(video_id
, subtitles_id
, hl
),
279 'automatic_captions': self
.extract_automatic_captions(
280 video_id
, subtitles_id
, hl
),
284 class GoogleDriveFolderIE(InfoExtractor
):
285 IE_NAME
= 'GoogleDrive:Folder'
286 _VALID_URL
= r
'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})'
288 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
290 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
295 _BOUNDARY
= '=====vc17a3rwnndj====='
296 _REQUEST
= "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1"
297 _DATA
= f
'''--{_BOUNDARY}
298 content-type: application/http
299 content-transfer-encoding: binary
306 def _call_api(self
, folder_id
, key
, data
, **kwargs
):
307 response
= self
._download
_webpage
(
308 'https://clients6.google.com/batch/drive/v2beta',
309 folder_id
, data
=data
.encode('utf-8'),
311 'Content-Type': 'text/plain;charset=UTF-8;',
312 'Origin': 'https://drive.google.com',
314 '$ct': f
'multipart/mixed; boundary="{self._BOUNDARY}"',
317 return self
._search
_json
('', response
, 'api response', folder_id
, **kwargs
) or {}
319 def _get_folder_items(self
, folder_id
, key
):
321 while page_token
is not None:
322 request
= self
._REQUEST
.format(folder_id
=folder_id
, page_token
=page_token
, key
=key
)
323 page
= self
._call
_api
(folder_id
, key
, self
._DATA
% request
)
324 yield from page
['items']
325 page_token
= page
.get('nextPageToken')
327 def _real_extract(self
, url
):
328 folder_id
= self
._match
_id
(url
)
330 webpage
= self
._download
_webpage
(url
, folder_id
)
331 key
= self
._search
_regex
(r
'"(\w{39})"', webpage
, 'key')
333 folder_info
= self
._call
_api
(folder_id
, key
, self
._DATA
% f
'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal
=False)
335 return self
.playlist_from_matches(
336 self
._get
_folder
_items
(folder_id
, key
), folder_id
, folder_info
.get('title'),
337 ie
=GoogleDriveIE
, getter
=lambda item
: f
'https://drive.google.com/file/d/{item["id"]}')