]>
Commit | Line | Data |
---|---|---|
5b251628 | 1 | from __future__ import unicode_literals |
2 | ||
3e5f3df1 | 3 | import re |
4 | ||
984e4d48 | 5 | from .common import InfoExtractor |
8e92d21e | 6 | from ..utils import ( |
fea82c17 | 7 | determine_ext, |
8e92d21e | 8 | ExtractorError, |
5b251628 | 9 | int_or_none, |
e4e50f60 | 10 | lowercase_escape, |
05915e37 | 11 | update_url_query, |
8e92d21e | 12 | ) |
984e4d48 | 13 | |
5b251628 | 14 | |
class GoogleDriveIE(InfoExtractor):
    """Extractor for videos served from docs/drive.google.com file pages.

    Formats are collected from two places: the ``fmt_stream_map`` /
    ``fmt_list`` values embedded in the file page, and the original file
    exposed through the ``uc?export=download`` endpoint.  Subtitles and
    automatic captions are listed through the ``timedtext`` endpoint.
    """

    # Whitespace inside the pattern is insignificant because of (?x).
    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
        'info_dict': {
            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
            'ext': 'mp4',
            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
        }
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'info_dict': {
            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
            'ext': 'mp4',
            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
            'duration': 189,
        },
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
    # Container extension for each known numeric format id found in
    # fmt_stream_map.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML tag that carries the entries of each caption type in the
    # timedtext listing.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Caption listing state.  These are only class-level defaults: the
    # instance attributes are rebound (never mutated in place) by
    # _download_subtitles_xml, so nothing is shared between instances.
    _caption_formats_ext = ()
    _captions_xml = None
    # (video_id, subtitles_id, hl) the cached listing belongs to.
    _captions_key = None

    @staticmethod
    def _extract_url(webpage):
        """Return a canonical Drive file URL for the first embedded player
        iframe found in ``webpage``, or None when there is none."""
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

    def _has_captions_xml(self):
        """Tell whether a usable caption listing is currently cached.

        Equivalent to the truth value of the cached element (an Element
        is truthy only when it has children) without relying on Element
        truth testing, which is deprecated in recent Python versions.
        """
        captions_xml = self._captions_xml
        # NOTE(review): _download_xml(..., fatal=False) presumably returns
        # a falsy non-element (False) on failure - confirm against
        # InfoExtractor.
        if captions_xml is None or captions_xml is False:
            return False
        return len(captions_xml) > 0

    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        """Fetch and cache the caption listing for one video.

        The listing is cached per (video_id, subtitles_id, hl) so that the
        subtitles and automatic captions lookups of the same video share a
        single request, while a different video never sees a stale
        listing.  A failed download is not cached and is retried on the
        next call.
        """
        key = (video_id, subtitles_id, hl)
        if self._captions_key == key and self._has_captions_xml():
            return
        # Drop whatever belonged to a previous video or a failed attempt.
        self._captions_key = key
        self._caption_formats_ext = []
        self._captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)
        if self._has_captions_xml():
            # Keep every advertised format code except the one flagged as
            # default.
            self._caption_formats_ext = [
                f.attrib['fmt_code']
                for f in self._captions_xml.findall('format')
                if f.attrib.get('fmt_code') and not f.attrib.get('default')]

    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        """Build the language -> format list mapping for one caption type.

        ``caption_type`` is a key of _CAPTIONS_ENTRY_TAG.  When
        ``origin_lang_code`` is given, every entry is requested as a
        translation (``tlang``) of that original language; otherwise the
        entry's own language is requested directly.  The caption listing
        must already have been downloaded by the caller.

        Returns None when ``subtitles_id`` or ``caption_type`` is empty.
        """
        if not subtitles_id or not caption_type:
            return
        captions = {}
        entry_tag = self._CAPTIONS_ENTRY_TAG[caption_type]
        for caption_entry in self._captions_xml.findall(entry_tag):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            source_lang = (caption_lang_code if origin_lang_code is None
                           else origin_lang_code)
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': source_lang,
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query['tlang'] = caption_lang_code
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        """Return the subtitles mapping, or None when the ids are missing
        or no caption listing could be obtained."""
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if not self._has_captions_xml():
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        """Return the automatic captions mapping, or None when the ids are
        missing, no caption listing could be obtained, or the listing has
        no track with a language code to translate from."""
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if not self._has_captions_xml():
            return
        # The first listed track provides the language that the
        # translated captions are generated from.
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

    def _real_extract(self, url):
        """Extract metadata, formats and captions for the file in ``url``.

        Raises ExtractorError (expected) with the reason embedded in the
        page when no format at all could be found and a reason is given.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://docs.google.com/file/d/%s' % video_id, video_id)

        title = self._search_regex(
            r'"title"\s*,\s*"([^"]+)', webpage, 'title',
            default=None) or self._og_search_title(webpage)
        duration = int_or_none(self._search_regex(
            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
            default=None))

        formats = []
        # str.split always yields at least one item, so both lists are
        # non-empty even when the values are absent from the page; the
        # loops below simply skip the resulting empty strings.
        fmt_stream_map = self._search_regex(
            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
            'fmt stream map', default='').split(',')
        fmt_list = self._search_regex(
            r'"fmt_list"\s*,\s*"([^"]+)', webpage,
            'fmt_list', default='').split(',')

        # fmt_list items look like "<format_id>/<width>x<height>...".
        resolutions = {}
        for fmt in fmt_list:
            mobj = re.search(
                r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    int(mobj.group('width')), int(mobj.group('height')))

        # fmt_stream_map items look like "<format_id>|<url>".
        for fmt_stream in fmt_stream_map:
            fmt_stream_split = fmt_stream.split('|')
            if len(fmt_stream_split) < 2:
                continue
            format_id, format_url = fmt_stream_split[:2]
            f = {
                'url': lowercase_escape(format_url),
                'format_id': format_id,
            }
            # A format id missing from the table must not abort the whole
            # extraction; leave 'ext' unset in that case.
            ext = self._FORMATS_EXT.get(format_id)
            if ext:
                f['ext'] = ext
            resolution = resolutions.get(format_id)
            if resolution:
                f.update({
                    'width': resolution[0],
                    'height': resolution[1],
                })
            formats.append(f)

        # The original file, offered as an extra "source" format.
        source_url = update_url_query(
            'https://drive.google.com/uc', {
                'id': video_id,
                'export': 'download',
            })
        urlh = self._request_webpage(
            source_url, video_id, note='Requesting source file',
            errnote='Unable to request source file', fatal=False)
        if urlh:
            def add_source_format(src_url):
                formats.append({
                    'url': src_url,
                    'ext': determine_ext(title, 'mp4').lower(),
                    'format_id': 'source',
                    'quality': 1,
                })
            if urlh.headers.get('Content-Disposition'):
                # The response is already the file itself.
                add_source_format(source_url)
            else:
                # Otherwise the response is a page that has to be
                # confirmed: pick the confirmation code out of it and
                # append it to the download URL.
                confirmation_webpage = self._webpage_read_content(
                    urlh, url, video_id, note='Downloading confirmation page',
                    errnote='Unable to confirm download', fatal=False)
                if confirmation_webpage:
                    confirm = self._search_regex(
                        r'confirm=([^&"\']+)', confirmation_webpage,
                        'confirmation code', fatal=False)
                    if confirm:
                        add_source_format(update_url_query(source_url, {
                            'confirm': confirm,
                        }))

        if not formats:
            # Prefer the reason given by the page over a generic error.
            reason = self._search_regex(
                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
            if reason:
                raise ExtractorError(reason, expected=True)

        self._sort_formats(formats)

        hl = self._search_regex(
            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
        subtitles_id = None
        ttsurl = self._search_regex(
            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode('utf-8').decode(
                'unicode_escape').split('=')[-1]

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }