[yt-dlp.git] / yt_dlp / extractor / googlesearch.py

from __future__ import unicode_literals

import itertools
import re

from .common import SearchInfoExtractor


class GoogleSearchIE(SearchInfoExtractor):
    """Search extractor that scrapes Google's video search result pages.

    Result pages are requested from ``www.google.com/search`` with
    ``tbm=vid`` and walked ten results at a time until enough entries
    have been gathered or no "next page" marker is present.
    """

    IE_NAME = 'video.google:search'
    IE_DESC = 'Google Video search'
    _SEARCH_KEY = 'gvsearch'
    _MAX_RESULTS = 1000
    _TEST = {
        'url': 'gvsearch15:python language',
        'info_dict': {
            'id': 'python language',
            'title': 'python language',
        },
        'playlist_count': 15,
    }

    def _get_n_results(self, query, n):
        """Collect up to ``n`` video results for ``query``.

        Returns a playlist dict whose ``id`` and ``title`` are both the
        query string and whose ``entries`` is a list of
        ``{'_type': 'url', 'url': ...}`` dicts, at most ``n`` long.
        """
        collected = []

        for page_no in itertools.count(1):
            html = self._download_webpage(
                'http://www.google.com/search',
                'gvsearch:' + query,
                note='Downloading result page %s' % page_no,
                query={
                    'tbm': 'vid',
                    'q': query,
                    # Pages are addressed by the offset of their first
                    # result; this code steps ten results per page.
                    'start': (page_no - 1) * 10,
                    'hl': 'en',
                })

            links = re.findall(r'<h3 class="r"><a href="([^"]+)"', html)
            for position, link in enumerate(links, 1):
                # Only keep hits that have a matching "vidthumb<N>" element
                # (N is the 1-based position of the hit on this page);
                # hits without one are skipped as playlists.
                if re.search(r'id="vidthumb%d"' % position, html):
                    collected.append({
                        '_type': 'url',
                        'url': link,
                    })

            # Stop once enough results are gathered, or when the page has
            # no "pnnext" element, i.e. there is no further page to fetch.
            has_next_page = re.search(r'id="pnnext"', html) is not None
            if len(collected) >= n or not has_next_page:
                break

        return {
            '_type': 'playlist',
            'id': query,
            'title': query,
            'entries': collected[:n],
        }
Commit	Line	Data
ccf9114e PH	1	from __future__ import unicode_literals
ccf9114e PH	2
3fc03845 PH	3	import itertools
	4	import re
	5
	6	from .common import SearchInfoExtractor
3fc03845 PH	7
	8
	9	class GoogleSearchIE(SearchInfoExtractor):
ccf9114e	10	IE_DESC = 'Google Video search'
3fc03845	11	_MAX_RESULTS = 1000
ccf9114e	12	IE_NAME = 'video.google:search'
3fc03845	13	_SEARCH_KEY = 'gvsearch'
829476b8 PH	14	_TEST = {
	15	'url': 'gvsearch15:python language',
	16	'info_dict': {
	17	'id': 'python language',
	18	'title': 'python language',
	19	},
	20	'playlist_count': 15,
	21	}
3fc03845 PH	22
	23	def _get_n_results(self, query, n):
	24	"""Get a specified number of results for a query"""
	25
ccf9114e	26	entries = []
3fc03845 PH	27	res = {
	28	'_type': 'playlist',
	29	'id': query,
ccf9114e	30	'title': query,
3fc03845 PH	31	}
3fc03845 PH	32
ccf9114e	33	for pagenum in itertools.count():
ccf9114e	34	webpage = self._download_webpage(
f3517569 S	35	'http://www.google.com/search',
	36	'gvsearch:' + query,
	37	note='Downloading result page %s' % (pagenum + 1),
	38	query={
	39	'tbm': 'vid',
	40	'q': query,
	41	'start': pagenum * 10,
	42	'hl': 'en',
	43	})
ccf9114e PH	44
	45	for hit_idx, mobj in enumerate(re.finditer(
	46	r'<h3 class="r"><a href="([^"]+)"', webpage)):
	47
	48	# Skip playlists
	49	if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
	50	continue
3fc03845	51
ccf9114e	52	entries.append({
3fc03845 PH	53	'_type': 'url',
3fc03845 PH	54	'url': mobj.group(1)
ccf9114e	55	})
3fc03845	56
c3d36f13	57	if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
ccf9114e	58	res['entries'] = entries[:n]
3fc03845	59	return res