[yt-dlp.git] / youtube_dl / extractor / googlesearch.py

from __future__ import unicode_literals

import itertools
import re

from .common import SearchInfoExtractor
from ..utils import (
    compat_urllib_parse,
)


class GoogleSearchIE(SearchInfoExtractor):
    """Search extractor that scrapes Google's video search result pages."""

    IE_NAME = 'video.google:search'
    IE_DESC = 'Google Video search'
    _SEARCH_KEY = 'gvsearch'
    _MAX_RESULTS = 1000

    def _get_n_results(self, query, n):
        """Return a playlist dict holding at most ``n`` hits for ``query``.

        Result pages are fetched one after another (ten results per page,
        selected through the ``start`` URL parameter) until either ``n``
        entries have been gathered or the page no longer contains a
        "next page" link (``id="pnnext"``).

        The returned dict has ``_type`` set to ``'playlist'``, uses the
        query as both ``id`` and ``title``, and lists the hits under
        ``entries`` as ``{'_type': 'url', 'url': ...}`` dicts.
        """
        found = []
        page_index = 0

        while True:
            page_url = (
                'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
                % (compat_urllib_parse.quote_plus(query), page_index * 10))
            webpage = self._download_webpage(
                page_url, 'gvsearch:' + query,
                note='Downloading result page ' + str(page_index + 1))

            hits = re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage)
            for position, hit in enumerate(hits, 1):
                # Only keep hits whose 1-based position has a matching
                # "vidthumbN" element on the page; the others are skipped
                # (they are treated as playlists).
                if re.search(r'id="vidthumb%d"' % position, webpage):
                    found.append({
                        '_type': 'url',
                        'url': hit.group(1)
                    })

            # Stop when enough hits were gathered or there is no next page.
            if len(found) >= n or not re.search(r'id="pnnext"', webpage):
                return {
                    '_type': 'playlist',
                    'id': query,
                    'title': query,
                    'entries': found[:n],
                }

            page_index += 1
Commit	Line	Data
ccf9114e PH	1	from __future__ import unicode_literals
ccf9114e PH	2
3fc03845 PH	3	import itertools
	4	import re
	5
	6	from .common import SearchInfoExtractor
	7	from ..utils import (
	8	compat_urllib_parse,
	9	)
	10
	11
	12	class GoogleSearchIE(SearchInfoExtractor):
ccf9114e	13	IE_DESC = 'Google Video search'
3fc03845	14	_MAX_RESULTS = 1000
ccf9114e	15	IE_NAME = 'video.google:search'
3fc03845 PH	16	_SEARCH_KEY = 'gvsearch'
	17
	18	def _get_n_results(self, query, n):
	19	"""Get a specified number of results for a query"""
	20
ccf9114e	21	entries = []
3fc03845 PH	22	res = {
	23	'_type': 'playlist',
	24	'id': query,
ccf9114e	25	'title': query,
3fc03845 PH	26	}
3fc03845 PH	27
ccf9114e PH	28	for pagenum in itertools.count():
	29	result_url = (
	30	'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
	31	% (compat_urllib_parse.quote_plus(query), pagenum * 10))
	32
	33	webpage = self._download_webpage(
	34	result_url, 'gvsearch:' + query,
	35	note='Downloading result page ' + str(pagenum + 1))
	36
	37	for hit_idx, mobj in enumerate(re.finditer(
	38	r'<h3 class="r"><a href="([^"]+)"', webpage)):
	39
	40	# Skip playlists
	41	if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
	42	continue
3fc03845	43
ccf9114e	44	entries.append({
3fc03845 PH	45	'_type': 'url',
3fc03845 PH	46	'url': mobj.group(1)
ccf9114e	47	})
3fc03845	48
c3d36f13	49	if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
ccf9114e	50	res['entries'] = entries[:n]
3fc03845	51	return res