[yt-dlp.git] / yt_dlp / extractor / googledrive.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import (
    determine_ext,
    ExtractorError,
    get_element_by_class,
    int_or_none,
    lowercase_escape,
    try_get,
    update_url_query,
)


class GoogleDriveIE(InfoExtractor):
    """Extractor for videos hosted on Google Drive / Google Docs.

    Matches ``docs.google.com`` / ``drive.google.com`` file, ``uc`` and
    ``open`` URLs as well as ``video.google.com/get_player`` URLs, and
    extracts the transcoded streams, the original ("source") file and the
    subtitle / automatic caption tracks advertised by the timedtext
    endpoint.
    """

    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'only_matching': True,
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
    # Maps the numeric format id found in fmt_stream_map to a container
    # extension.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML tag that lists the entries of each caption type in the timedtext
    # "list" response.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Caption state below is per video. The class-level values are only
    # defaults: _download_subtitles_xml rebinds all three on the instance and
    # never mutates the shared class-level list in place.
    _caption_formats_ext = []
    _captions_xml = None
    _captions_cache_key = None

    @staticmethod
    def _extract_url(webpage):
        """Return a canonical Drive file URL for the first embedded player
        iframe found in ``webpage``, or None when there is none."""
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        """Fetch the caption listing for a video and cache it on the instance.

        The result is stored in ``self._captions_xml`` (None on failure) and
        the non-default caption format codes in ``self._caption_formats_ext``.
        The cache is keyed on the call arguments so that a listing downloaded
        for one video is never reused for another video handled by the same
        extractor instance.
        """
        cache_key = (video_id, subtitles_id, hl)
        if (self._captions_cache_key == cache_key
                and self._captions_xml is not None):
            return
        # Reset everything first so a failed download cannot leave data that
        # belongs to a previously extracted video behind.
        self._captions_cache_key = cache_key
        self._captions_xml = None
        self._caption_formats_ext = []
        captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)
        # NOTE(review): with fatal=False the helper is expected to hand back
        # None/False instead of an element on failure - confirm against
        # InfoExtractor._download_xml. Identity checks are used because the
        # truth value of an Element depends on whether it has children.
        if captions_xml is None or captions_xml is False:
            return
        self._captions_xml = captions_xml
        self._caption_formats_ext = [
            f.attrib['fmt_code']
            for f in captions_xml.findall('format')
            if f.attrib.get('fmt_code') and not f.attrib.get('default')]

    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        """Build the caption dict for one caption type.

        ``caption_type`` is a key of ``_CAPTIONS_ENTRY_TAG``. When
        ``origin_lang_code`` is given, every entry is requested as a
        translation (``tlang``) of that original language; otherwise the
        entry's own language is requested directly.

        Returns a dict mapping language code to a list of
        ``{'url': ..., 'ext': ...}`` dicts, or None when the required
        arguments or the caption listing are missing.
        """
        if not subtitles_id or not caption_type:
            return
        if self._captions_xml is None:
            return
        captions = {}
        for caption_entry in self._captions_xml.findall(
                self._CAPTIONS_ENTRY_TAG[caption_type]):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': (caption_lang_code if origin_lang_code is None
                             else origin_lang_code),
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query.update({'tlang': caption_lang_code})
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        """Return the uploaded subtitle tracks, or None when unavailable."""
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        """Return translated captions derived from the first listed track,
        or None when there is no track with a language code."""
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

    def _real_extract(self, url):
        """Extract metadata, formats and captions for a Drive video URL.

        Raises ExtractorError (expected) when the service reports a failure
        reason and no title is available.
        """
        video_id = self._match_id(url)
        video_info = compat_parse_qs(self._download_webpage(
            'https://drive.google.com/get_video_info',
            video_id, query={'docid': video_id}))

        def get_value(key):
            return try_get(video_info, lambda x: x[key][0])

        reason = get_value('reason')
        title = get_value('title')
        if not title and reason:
            raise ExtractorError(reason, expected=True)

        # fmt_list entries look like "<format_id>/<width>x<height>/..."; a
        # missing value degrades to [''] which simply matches nothing.
        resolutions = {}
        for fmt in (get_value('fmt_list') or '').split(','):
            mobj = re.search(
                r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    int(mobj.group('width')), int(mobj.group('height')))

        formats = []
        # fmt_stream_map entries look like "<format_id>|<url>".
        for fmt_stream in (get_value('fmt_stream_map') or '').split(','):
            fmt_stream_split = fmt_stream.split('|')
            if len(fmt_stream_split) < 2:
                continue
            format_id, format_url = fmt_stream_split[:2]
            f = {
                'url': lowercase_escape(format_url),
                'format_id': format_id,
                # An id missing from the table must not abort the whole
                # extraction; 'ext' is left as None in that case.
                # NOTE(review): presumably the core infers a missing ext from
                # the URL - confirm.
                'ext': self._FORMATS_EXT.get(format_id),
            }
            resolution = resolutions.get(format_id)
            if resolution:
                f.update({
                    'width': resolution[0],
                    'height': resolution[1],
                })
            formats.append(f)

        source_url = update_url_query(
            'https://drive.google.com/uc', {
                'id': video_id,
                'export': 'download',
            })

        def request_source_file(source_url, kind):
            return self._request_webpage(
                source_url, video_id, note='Requesting %s file' % kind,
                errnote='Unable to request %s file' % kind, fatal=False)
        urlh = request_source_file(source_url, 'source')
        if urlh:
            def add_source_format(urlh):
                formats.append({
                    # Use redirect URLs as download URLs in order to calculate
                    # correct cookies in _calc_cookies.
                    # Using original URLs may result in redirect loop due to
                    # google.com's cookies mistakenly used for googleusercontent.com
                    # redirect URLs (see #23919).
                    'url': urlh.geturl(),
                    'ext': determine_ext(title, 'mp4').lower(),
                    'format_id': 'source',
                    'quality': 1,
                })
            if urlh.headers.get('Content-Disposition'):
                # The file is served directly.
                add_source_format(urlh)
            else:
                # No attachment header: the response is treated as a
                # confirmation page carrying a "confirm" code that has to be
                # echoed back.
                confirmation_webpage = self._webpage_read_content(
                    urlh, url, video_id, note='Downloading confirmation page',
                    errnote='Unable to confirm download', fatal=False)
                if confirmation_webpage:
                    confirm = self._search_regex(
                        r'confirm=([^&"\']+)', confirmation_webpage,
                        'confirmation code', default=None)
                    if confirm:
                        confirmed_source_url = update_url_query(source_url, {
                            'confirm': confirm,
                        })
                        urlh = request_source_file(confirmed_source_url, 'confirmed source')
                        if urlh and urlh.headers.get('Content-Disposition'):
                            add_source_format(urlh)
                    else:
                        self.report_warning(
                            get_element_by_class('uc-error-subcaption', confirmation_webpage)
                            or get_element_by_class('uc-error-caption', confirmation_webpage)
                            or 'unable to extract confirmation code')

        if not formats and reason:
            self.raise_no_formats(reason, expected=True)

        self._sort_formats(formats)

        hl = get_value('hl')
        subtitles_id = None
        ttsurl = get_value('ttsurl')
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode('utf-8').decode(
                'unicode_escape').split('=')[-1]

        try:
            self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID')
        except KeyError:
            # CookieJar.clear raises KeyError when no matching cookie exists;
            # there is nothing to remove in that case.
            pass

        return {
            'id': video_id,
            'title': title,
            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
            'duration': int_or_none(get_value('length_seconds')),
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }
Commit	Line	Data
5b251628	1	from __future__ import unicode_literals
5b251628	2
3e5f3df1	3	import re
3e5f3df1	4
984e4d48	5	from .common import InfoExtractor
a0566bbf	6	from ..compat import compat_parse_qs
8e92d21e	7	from ..utils import (
fea82c17	8	determine_ext,
8e92d21e	9	ExtractorError,
2181983a	10	get_element_by_class,
5b251628	11	int_or_none,
e4e50f60	12	lowercase_escape,
a0566bbf	13	try_get,
05915e37	14	update_url_query,
8e92d21e	15	)
984e4d48	16
5b251628	17
5b251628	18	class GoogleDriveIE(InfoExtractor):
1b41da48 S	19	_VALID_URL = r'''(?x)
	20	https?://
	21	(?:
	22	(?:docs\|drive)\.google\.com/
	23	(?:
	24	(?:uc\|open)\?.*?id=\|
	25	file/d/
	26	)\|
	27	video\.google\.com/get_player\?.*?docid=
	28	)
	29	(?P<id>[a-zA-Z0-9_-]{28,})
	30	'''
58e6d097	31	_TESTS = [{
5b251628	32	'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
fea82c17	33	'md5': '5c602afbbf2c1db91831f5d82f678554',
3e5f3df1	34	'info_dict': {
5b251628	35	'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
3e5f3df1	36	'ext': 'mp4',
5b251628	37	'title': 'Big Buck Bunny.mp4',
e4e50f60	38	'duration': 45,
3e5f3df1	39	}
fea82c17 S	40	}, {
fea82c17 S	41	# video can't be watched anonymously due to view count limit reached,
067aa17e	42	# but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
fea82c17	43	'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
a0566bbf	44	'only_matching': True,
58e6d097 S	45	}, {
	46	# video id is longer than 28 characters
	47	'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
1b41da48 S	48	'only_matching': True,
	49	}, {
	50	'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
	51	'only_matching': True,
	52	}, {
	53	'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
	54	'only_matching': True,
58e6d097	55	}]
5b251628	56	_FORMATS_EXT = {
	57	'5': 'flv',
	58	'6': 'flv',
	59	'13': '3gp',
	60	'17': '3gp',
	61	'18': 'mp4',
	62	'22': 'mp4',
	63	'34': 'flv',
	64	'35': 'flv',
	65	'36': '3gp',
	66	'37': 'mp4',
	67	'38': 'mp4',
	68	'43': 'webm',
	69	'44': 'webm',
	70	'45': 'webm',
	71	'46': 'webm',
	72	'59': 'mp4',
	73	}
05915e37 PV	74	_BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
	75	_CAPTIONS_ENTRY_TAG = {
	76	'subtitles': 'track',
	77	'automatic_captions': 'target',
	78	}
	79	_caption_formats_ext = []
37d9af30	80	_captions_xml = None
3e5f3df1	81
	82	@staticmethod
	83	def _extract_url(webpage):
	84	mobj = re.search(
58e6d097	85	r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=\|(?:docs\|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
3e5f3df1	86	webpage)
	87	if mobj:
	88	return 'https://drive.google.com/file/d/%s' % mobj.group('id')
	89
37d9af30 S	90	def _download_subtitles_xml(self, video_id, subtitles_id, hl):
	91	if self._captions_xml:
	92	return
	93	self._captions_xml = self._download_xml(
	94	self._BASE_URL_CAPTIONS, video_id, query={
05915e37	95	'id': video_id,
37d9af30	96	'vid': subtitles_id,
05915e37 PV	97	'hl': hl,
	98	'v': video_id,
	99	'type': 'list',
	100	'tlangs': '1',
	101	'fmts': '1',
	102	'vssids': '1',
37d9af30 S	103	}, note='Downloading subtitles XML',
	104	errnote='Unable to download subtitles XML', fatal=False)
	105	if self._captions_xml:
	106	for f in self._captions_xml.findall('format'):
	107	if f.attrib.get('fmt_code') and not f.attrib.get('default'):
	108	self._caption_formats_ext.append(f.attrib['fmt_code'])
	109
	110	def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
	111	origin_lang_code=None):
	112	if not subtitles_id or not caption_type:
	113	return
05915e37	114	captions = {}
37d9af30 S	115	for caption_entry in self._captions_xml.findall(
37d9af30 S	116	self._CAPTIONS_ENTRY_TAG[caption_type]):
05915e37 PV	117	caption_lang_code = caption_entry.attrib.get('lang_code')
	118	if not caption_lang_code:
	119	continue
	120	caption_format_data = []
	121	for caption_format in self._caption_formats_ext:
	122	query = {
37d9af30	123	'vid': subtitles_id,
05915e37 PV	124	'v': video_id,
05915e37 PV	125	'fmt': caption_format,
37d9af30 S	126	'lang': (caption_lang_code if origin_lang_code is None
37d9af30 S	127	else origin_lang_code),
05915e37 PV	128	'type': 'track',
	129	'name': '',
	130	'kind': '',
	131	}
37d9af30	132	if origin_lang_code is not None:
05915e37 PV	133	query.update({'tlang': caption_lang_code})
	134	caption_format_data.append({
	135	'url': update_url_query(self._BASE_URL_CAPTIONS, query),
	136	'ext': caption_format,
	137	})
	138	captions[caption_lang_code] = caption_format_data
05915e37 PV	139	return captions
05915e37 PV	140
37d9af30 S	141	def _get_subtitles(self, video_id, subtitles_id, hl):
	142	if not subtitles_id or not hl:
	143	return
	144	self._download_subtitles_xml(video_id, subtitles_id, hl)
	145	if not self._captions_xml:
	146	return
	147	return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
	148
	149	def _get_automatic_captions(self, video_id, subtitles_id, hl):
	150	if not subtitles_id or not hl:
	151	return
	152	self._download_subtitles_xml(video_id, subtitles_id, hl)
	153	if not self._captions_xml:
	154	return
	155	track = self._captions_xml.find('track')
	156	if track is None:
	157	return
	158	origin_lang_code = track.attrib.get('lang_code')
	159	if not origin_lang_code:
	160	return
	161	return self._get_captions_by_type(
	162	video_id, subtitles_id, 'automatic_captions', origin_lang_code)
05915e37	163
3e5f3df1	164	def _real_extract(self, url):
3e5f3df1	165	video_id = self._match_id(url)
a0566bbf	166	video_info = compat_parse_qs(self._download_webpage(
	167	'https://drive.google.com/get_video_info',
	168	video_id, query={'docid': video_id}))
	169
	170	def get_value(key):
	171	return try_get(video_info, lambda x: x[key][0])
3e5f3df1	172
a0566bbf	173	reason = get_value('reason')
	174	title = get_value('title')
	175	if not title and reason:
	176	raise ExtractorError(reason, expected=True)
fea82c17 S	177
fea82c17 S	178	formats = []
a0566bbf	179	fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
a0566bbf	180	fmt_list = (get_value('fmt_list') or '').split(',')
fea82c17 S	181	if fmt_stream_map and fmt_list:
	182	resolutions = {}
	183	for fmt in fmt_list:
	184	mobj = re.search(
	185	r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
	186	if mobj:
	187	resolutions[mobj.group('format_id')] = (
	188	int(mobj.group('width')), int(mobj.group('height')))
984e4d48	189
fea82c17 S	190	for fmt_stream in fmt_stream_map:
	191	fmt_stream_split = fmt_stream.split('\|')
	192	if len(fmt_stream_split) < 2:
	193	continue
	194	format_id, format_url = fmt_stream_split[:2]
	195	f = {
	196	'url': lowercase_escape(format_url),
	197	'format_id': format_id,
	198	'ext': self._FORMATS_EXT[format_id],
	199	}
	200	resolution = resolutions.get(format_id)
	201	if resolution:
	202	f.update({
	203	'width': resolution[0],
	204	'height': resolution[1],
	205	})
	206	formats.append(f)
9be9ec59	207
fea82c17 S	208	source_url = update_url_query(
	209	'https://drive.google.com/uc', {
	210	'id': video_id,
	211	'export': 'download',
	212	})
da2069fb S	213
	214	def request_source_file(source_url, kind):
	215	return self._request_webpage(
	216	source_url, video_id, note='Requesting %s file' % kind,
	217	errnote='Unable to request %s file' % kind, fatal=False)
	218	urlh = request_source_file(source_url, 'source')
fea82c17	219	if urlh:
da2069fb	220	def add_source_format(urlh):
fea82c17	221	formats.append({
da2069fb S	222	# Use redirect URLs as download URLs in order to calculate
	223	# correct cookies in _calc_cookies.
	224	# Using original URLs may result in redirect loop due to
	225	# google.com's cookies mistakenly used for googleusercontent.com
	226	# redirect URLs (see #23919).
	227	'url': urlh.geturl(),
fea82c17 S	228	'ext': determine_ext(title, 'mp4').lower(),
	229	'format_id': 'source',
	230	'quality': 1,
9be9ec59	231	})
fea82c17	232	if urlh.headers.get('Content-Disposition'):
da2069fb	233	add_source_format(urlh)
fea82c17 S	234	else:
	235	confirmation_webpage = self._webpage_read_content(
	236	urlh, url, video_id, note='Downloading confirmation page',
	237	errnote='Unable to confirm download', fatal=False)
	238	if confirmation_webpage:
	239	confirm = self._search_regex(
	240	r'confirm=([^&"\']+)', confirmation_webpage,
2181983a	241	'confirmation code', default=None)
fea82c17	242	if confirm:
da2069fb	243	confirmed_source_url = update_url_query(source_url, {
fea82c17	244	'confirm': confirm,
da2069fb S	245	})
	246	urlh = request_source_file(confirmed_source_url, 'confirmed source')
	247	if urlh and urlh.headers.get('Content-Disposition'):
	248	add_source_format(urlh)
2181983a	249	else:
	250	self.report_warning(
	251	get_element_by_class('uc-error-subcaption', confirmation_webpage)
	252	or get_element_by_class('uc-error-caption', confirmation_webpage)
	253	or 'unable to extract confirmation code')
fea82c17	254
a0566bbf	255	if not formats and reason:
b7da73eb	256	self.raise_no_formats(reason, expected=True)
fea82c17	257
984e4d48	258	self._sort_formats(formats)
984e4d48	259
a0566bbf	260	hl = get_value('hl')
37d9af30	261	subtitles_id = None
a0566bbf	262	ttsurl = get_value('ttsurl')
05915e37	263	if ttsurl:
37d9af30 S	264	# the video Id for subtitles will be the last value in the ttsurl
	265	# query string
	266	subtitles_id = ttsurl.encode('utf-8').decode(
	267	'unicode_escape').split('=')[-1]
05915e37	268
d35cf6b7	269	self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID')
67475072	270
984e4d48	271	return {
	272	'id': video_id,
	273	'title': title,
a0566bbf	274	'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
a0566bbf	275	'duration': int_or_none(get_value('length_seconds')),
5b251628	276	'formats': formats,
37d9af30 S	277	'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
	278	'automatic_captions': self.extract_automatic_captions(
	279	video_id, subtitles_id, hl),
984e4d48	280	}