]> jfr.im git - yt-dlp.git/blob - youtube_dlc/extractor/googledrive.py
Updated to release 2020.11.21.1
[yt-dlp.git] / youtube_dlc / extractor / googledrive.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..compat import compat_parse_qs
7 from ..utils import (
8 determine_ext,
9 ExtractorError,
10 int_or_none,
11 lowercase_escape,
12 try_get,
13 update_url_query,
14 )
15
16
class GoogleDriveIE(InfoExtractor):
    """Information extractor for videos stored on Google Drive / Google Docs.

    Matches drive.google.com / docs.google.com file URLs (``file/d/<id>``,
    ``uc?id=<id>``, ``open?id=<id>``) and ``video.google.com/get_player``
    URLs, and builds formats from the ``get_video_info`` endpoint plus the
    original ("source") file download.
    """

    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'only_matching': True,
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
    # Maps the format id found in fmt_stream_map to a container extension.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML tag that lists the available languages for each caption type.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Per-video caption cache. These class-level values are only defaults:
    # _download_subtitles_xml() always rebinds them on the instance and never
    # mutates the shared list in place.
    _caption_formats_ext = []
    _captions_xml = None
    _captions_video_id = None

    @staticmethod
    def _extract_url(webpage):
        """Return a canonical Drive file URL for the first embedded player.

        Looks for an <iframe> whose src points at a Google Drive/Docs file or
        at video.google.com/get_player. Returns None when nothing matches.
        """
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        """Fetch and cache the caption list XML for ``video_id``.

        After the call ``self._captions_xml`` is either a non-empty XML
        element or None (download failed or the document has no children),
        and ``self._caption_formats_ext`` holds the non-default caption
        format codes advertised by that document.
        """
        # Reuse the cached document only if it belongs to this very video;
        # otherwise a second extraction with the same instance would serve
        # the captions of the previous video.
        if (self._captions_xml is not None
                and self._captions_video_id == video_id):
            return

        self._captions_video_id = video_id
        self._captions_xml = None
        self._caption_formats_ext = []

        captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)

        # A non-fatal download may hand back a falsy value instead of an
        # element; an element without children carries no caption data.
        # Explicit checks avoid truth-testing an Element (deprecated).
        if captions_xml is None or captions_xml is False or not len(captions_xml):
            return

        self._captions_xml = captions_xml
        # Fresh list bound to the instance: never append to the class-level
        # default, which is shared by every instance.
        self._caption_formats_ext = [
            fmt.attrib['fmt_code']
            for fmt in captions_xml.findall('format')
            if fmt.attrib.get('fmt_code') and not fmt.attrib.get('default')]

    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        """Build a ``{lang_code: [{'url', 'ext'}, ...]}`` captions dict.

        ``caption_type`` must be a key of ``_CAPTIONS_ENTRY_TAG``. When
        ``origin_lang_code`` is given, every entry requests that source
        language and passes the entry's own language as ``tlang``.
        Returns None if the required ids or the caption XML are missing.
        """
        if not subtitles_id or not caption_type:
            return
        if self._captions_xml is None:
            return
        captions = {}
        for caption_entry in self._captions_xml.findall(
                self._CAPTIONS_ENTRY_TAG[caption_type]):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': (caption_lang_code if origin_lang_code is None
                             else origin_lang_code),
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query.update({'tlang': caption_lang_code})
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        """Return the subtitles listed as <track> entries, or None.

        NOTE(review): presumably invoked through extract_subtitles() of the
        base class - confirm against InfoExtractor.
        """
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        """Return captions for every <target> language, or None.

        The language of the first <track> element is used as the source
        language of each request.

        NOTE(review): presumably invoked through extract_automatic_captions()
        of the base class - confirm against InfoExtractor.
        """
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

    def _real_extract(self, url):
        """Extract metadata, formats and captions for a Google Drive video.

        Raises ExtractorError (expected) with the server supplied ``reason``
        when there is no title, or when no format at all could be found.
        """
        video_id = self._match_id(url)
        video_info = compat_parse_qs(self._download_webpage(
            'https://drive.google.com/get_video_info',
            video_id, query={'docid': video_id}))

        def get_value(key):
            # compat_parse_qs yields lists; take the first value or None
            return try_get(video_info, lambda x: x[key][0])

        reason = get_value('reason')
        title = get_value('title')
        if not title and reason:
            raise ExtractorError(reason, expected=True)

        formats = []
        # str.split() never returns an empty list, so both values can be
        # iterated unconditionally; empty items are skipped below.
        fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
        fmt_list = (get_value('fmt_list') or '').split(',')

        # fmt_list items look like "<format_id>/<width>x<height>..."
        resolutions = {}
        for fmt in fmt_list:
            mobj = re.search(
                r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    int(mobj.group('width')), int(mobj.group('height')))

        # fmt_stream_map items look like "<format_id>|<url>"
        for fmt_stream in fmt_stream_map:
            fmt_stream_split = fmt_stream.split('|')
            if len(fmt_stream_split) < 2:
                continue
            format_id, format_url = fmt_stream_split[:2]
            f = {
                'url': lowercase_escape(format_url),
                'format_id': format_id,
            }
            # A format id missing from the table must not abort the whole
            # extraction; leave 'ext' unset in that case.
            ext = self._FORMATS_EXT.get(format_id)
            if ext:
                f['ext'] = ext
            resolution = resolutions.get(format_id)
            if resolution:
                f.update({
                    'width': resolution[0],
                    'height': resolution[1],
                })
            formats.append(f)

        source_url = update_url_query(
            'https://drive.google.com/uc', {
                'id': video_id,
                'export': 'download',
            })

        def request_source_file(source_url, kind):
            return self._request_webpage(
                source_url, video_id, note='Requesting %s file' % kind,
                errnote='Unable to request %s file' % kind, fatal=False)

        def add_source_format(urlh):
            formats.append({
                # Use redirect URLs as download URLs in order to calculate
                # correct cookies in _calc_cookies.
                # Using original URLs may result in redirect loop due to
                # google.com's cookies mistakenly used for googleusercontent.com
                # redirect URLs (see #23919).
                'url': urlh.geturl(),
                'ext': determine_ext(title, 'mp4').lower(),
                'format_id': 'source',
                'quality': 1,
            })

        urlh = request_source_file(source_url, 'source')
        if urlh:
            if urlh.headers.get('Content-Disposition'):
                # The response already is the file itself
                add_source_format(urlh)
            else:
                # Otherwise the response is expected to be a page carrying a
                # confirmation code that has to be sent back
                confirmation_webpage = self._webpage_read_content(
                    urlh, url, video_id, note='Downloading confirmation page',
                    errnote='Unable to confirm download', fatal=False)
                if confirmation_webpage:
                    confirm = self._search_regex(
                        r'confirm=([^&"\']+)', confirmation_webpage,
                        'confirmation code', fatal=False)
                    if confirm:
                        confirmed_source_url = update_url_query(source_url, {
                            'confirm': confirm,
                        })
                        urlh = request_source_file(confirmed_source_url, 'confirmed source')
                        if urlh and urlh.headers.get('Content-Disposition'):
                            add_source_format(urlh)

        if not formats and reason:
            raise ExtractorError(reason, expected=True)

        self._sort_formats(formats)

        hl = get_value('hl')
        subtitles_id = None
        ttsurl = get_value('ttsurl')
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode('utf-8').decode(
                'unicode_escape').split('=')[-1]

        # CookieJar.clear() raises KeyError when no matching cookie exists;
        # a missing NID cookie must not fail an otherwise successful
        # extraction.
        # NOTE(review): the reason for dropping NID is not documented here.
        try:
            self._downloader.cookiejar.clear(
                domain='.google.com', path='/', name='NID')
        except KeyError:
            pass

        return {
            'id': video_id,
            'title': title,
            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
            'duration': int_or_none(get_value('length_seconds')),
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }