[yt-dlp.git] / youtube_dl / extractor / common.py

import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from the format_id, width, height
                                and format_note fields if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                ("3D" or "DASH video")
                    * width     Width of the video, if known
                    * height    Height of the video, if known
                    * abr       Average audio bitrate in KBit/s
                    * acodec    Name of the audio codec in use
                    * vbr       Average video bitrate in KBit/s
                    * vcodec    Name of the video codec in use
                    * filesize  The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Download a page and return it decoded, together with its handle.

        url_or_request may be a URL string or a request object. For a
        string, everything from the first '#' on is dropped before the
        request is made (#1038). video_id, note, errnote and fatal are
        passed on to _request_webpage().

        Returns a tuple (page content as string, URL handle), or False if
        the request failed and fatal is false.

        The charset is taken from the Content-Type header, otherwise from a
        <meta> tag within the first 1024 bytes, otherwise utf-8 is assumed.
        If the announced charset is unknown to Python, utf-8 is used as
        well. Undecodable bytes are replaced instead of raising.

        Depending on the downloader parameters 'dump_intermediate_pages'
        and 'write_pages', the raw bytes are also printed base64-encoded
        and/or written to a .dump file.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # Request objects expose get_full_url(); a plain string is the URL
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server or the page announced a charset Python does not
            # know; do not abort the extraction because of that.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.

        pattern is a string, a compiled regex, or an iterable of those; the
        patterns of an iterable are tried in order and the first match wins.
        An empty iterable is treated like "no match".

        On failure, a default that is not None is returned; otherwise
        RegexNotFoundError is raised if fatal is true, else a warning is
        reported and None is returned.

        NOTE: the matching pattern needs at least one group that took part
        in the match, otherwise next() raises StopIteration.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Stays None if the list is empty or none of the patterns match
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        if default is not None:
            return default

        # Only the failure messages need the (possibly colorized) field name
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
        
        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS, and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regexp for search queries from the class's _SEARCH_KEY."""
        query_re = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
        return query_re

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many results (capped at _MAX_RESULTS with a
        warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The search prefix (_SEARCH_KEY) of this extractor."""
        key = self._SEARCH_KEY
        return key
Commit	Line	Data
d6983cb4 PH	1	import base64
	2	import os
	3	import re
	4	import socket
	5	import sys
fc79158d	6	import netrc
267ed0c5	7	import xml.etree.ElementTree
d6983cb4 PH	8
	9	from ..utils import (
	10	compat_http_client,
	11	compat_urllib_error,
d6983cb4 PH	12	compat_str,
	13
	14	clean_html,
	15	compiled_regex_type,
	16	ExtractorError,
55b3e45b	17	RegexNotFoundError,
d41e6efc	18	sanitize_filename,
f38de77f	19	unescapeHTML,
d6983cb4 PH	20	)
d6983cb4 PH	21
dca08720	22
d6983cb4 PH	23	class InfoExtractor(object):
	24	"""Information Extractor class.
	25
	26	Information extractors are the classes that, given a URL, extract
	27	information about the video (or videos) the URL refers to. This
	28	information includes the real video URL, the video title, author and
	29	others. The information is stored in a dictionary which is then
	30	passed to the FileDownloader. The FileDownloader processes this
	31	information possibly downloading the video to the file system, among
	32	other possible outcomes.
	33
	34	The dictionaries must include the following fields:
	35
	36	id: Video identifier.
	37	url: Final video URL.
	38	title: Video title, unescaped.
	39	ext: Video filename extension.
	40
2f5865cc PH	41	Instead of url and ext, formats can also specified.
2f5865cc PH	42
d6983cb4 PH	43	The following fields are optional:
	44
	45	format: The video format, defaults to ext (used for --get-format)
73e79f2a PH	46	thumbnails: A list of dictionaries (with the entries "resolution" and
73e79f2a PH	47	"url") for the varying thumbnails
d6983cb4 PH	48	thumbnail: Full URL to a video thumbnail image.
	49	description: One-line video description.
	50	uploader: Full name of the video uploader.
	51	upload_date: Video upload date (YYYYMMDD).
	52	uploader_id: Nickname or id of the video uploader.
	53	location: Physical location of the video.
	54	player_url: SWF Player URL (used for rtmpdump).
5d51a883 JMF	55	subtitles: The subtitle file contents as a dictionary in the format
5d51a883 JMF	56	{language: subtitles}.
f3d29461	57	view_count: How many users have watched the video on the platform.
19e3dfc9 PH	58	like_count: Number of positive ratings of the video
	59	dislike_count: Number of negative ratings of the video
	60	comment_count: Number of comments on the video
d6983cb4 PH	61	urlhandle: [internal] The urlHandle to be used to download the file,
d6983cb4 PH	62	like returned by urllib.request.urlopen
8dbe9899	63	age_limit: Age restriction for the video, as an integer (years)
deefc05b PH	64	formats: A list of dictionaries for each format available, it must
	65	be ordered from worst to best quality. Potential fields:
	66	* url Mandatory. The URL of the video file
	67	* ext Will be calculated from url if missing
	68	* format A human-readable description of the format
	69	("mp4 container with h264/opus").
b5d0d817	70	Calculated from the format_id, width, height.
8c51aa65	71	and format_note fields if missing.
deefc05b PH	72	* format_id A short description of the format
deefc05b PH	73	("mp4_h264_opus" or "19")
8c51aa65 JMF	74	* format_note Additional info about the format
8c51aa65 JMF	75	("3D" or "DASH video")
deefc05b PH	76	* width Width of the video, if known
deefc05b PH	77	* height Height of the video, if known
91c7271a PH	78	* abr Average audio bitrate in KBit/s
	79	* acodec Name of the audio codec in use
	80	* vbr Average video bitrate in KBit/s
	81	* vcodec Name of the video codec in use
02dbf93f	82	* filesize The number of bytes, if known in advance
9103bbc5 JMF	83	webpage_url: The url to the video webpage, if given to youtube-dl it
	84	should allow to get the same result again. (It will be set
	85	by YoutubeDL if it's missing)
d6983cb4	86
deefc05b	87	Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 PH	88
	89	Subclasses of this one should re-define the _real_initialize() and
	90	_real_extract() methods and define a _VALID_URL regexp.
	91	Probably, they should also be added to the list of extractors.
	92
	93	_real_extract() must return a list of information dictionaries as
	94	described above.
	95
	96	Finally, the _WORKING attribute should be set to False for broken IEs
	97	in order to warn the users and skip the tests.
	98	"""
	99
	100	_ready = False
	101	_downloader = None
	102	_WORKING = True
	103
	104	def __init__(self, downloader=None):
	105	"""Constructor. Receives an optional downloader."""
	106	self._ready = False
	107	self.set_downloader(downloader)
	108
	109	@classmethod
	110	def suitable(cls, url):
	111	"""Receives a URL and returns True if suitable for this IE."""
79cb2577 PH	112
	113	# This does not use has/getattr intentionally - we want to know whether
	114	# we have cached the regexp for this class, whereas getattr would also
	115	# match the superclass
	116	if '_VALID_URL_RE' not in cls.__dict__:
	117	cls._VALID_URL_RE = re.compile(cls._VALID_URL)
	118	return cls._VALID_URL_RE.match(url) is not None
d6983cb4 PH	119
	120	@classmethod
	121	def working(cls):
	122	"""Getter method for _WORKING."""
	123	return cls._WORKING
	124
	125	def initialize(self):
	126	"""Initializes an instance (authentication, etc)."""
	127	if not self._ready:
	128	self._real_initialize()
	129	self._ready = True
	130
	131	def extract(self, url):
	132	"""Extracts URL information and returns it in list of dicts."""
	133	self.initialize()
	134	return self._real_extract(url)
	135
	136	def set_downloader(self, downloader):
	137	"""Sets the downloader for this IE."""
	138	self._downloader = downloader
	139
	140	def _real_initialize(self):
	141	"""Real initialization process. Redefine in subclasses."""
	142	pass
	143
	144	def _real_extract(self, url):
	145	"""Real extraction process. Redefine in subclasses."""
	146	pass
	147
56c73665 JMF	148	@classmethod
	149	def ie_key(cls):
	150	"""A string for getting the InfoExtractor with get_info_extractor"""
	151	return cls.__name__[:-2]
	152
d6983cb4 PH	153	@property
	154	def IE_NAME(self):
	155	return type(self).__name__[:-2]
	156
7cc3570e	157	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 PH	158	""" Returns the response handle """
	159	if note is None:
	160	self.report_download_webpage(video_id)
	161	elif note is not False:
7cc3570e PH	162	if video_id is None:
	163	self.to_screen(u'%s' % (note,))
	164	else:
	165	self.to_screen(u'%s: %s' % (video_id, note))
d6983cb4	166	try:
dca08720	167	return self._downloader.urlopen(url_or_request)
d6983cb4 PH	168	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
	169	if errnote is None:
	170	errnote = u'Unable to download webpage'
7cc3570e PH	171	errmsg = u'%s: %s' % (errnote, compat_str(err))
	172	if fatal:
	173	raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
	174	else:
	175	self._downloader.report_warning(errmsg)
	176	return False
d6983cb4	177
7cc3570e	178	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	179	""" Returns a tuple (page content as string, URL handle) """
b9d3e163 PH	180
	181	# Strip hashes from the URL (#1038)
	182	if isinstance(url_or_request, (compat_str, str)):
	183	url_or_request = url_or_request.partition('#')[0]
	184
7cc3570e PH	185	urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
	186	if urlh is False:
	187	assert not fatal
	188	return False
d6983cb4	189	content_type = urlh.headers.get('Content-Type', '')
f143d86a	190	webpage_bytes = urlh.read()
d6983cb4 PH	191	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	192	if m:
	193	encoding = m.group(1)
	194	else:
0d75ae2c	195	m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a PH	196	webpage_bytes[:1024])
	197	if m:
	198	encoding = m.group(1).decode('ascii')
	199	else:
	200	encoding = 'utf-8'
d6983cb4 PH	201	if self._downloader.params.get('dump_intermediate_pages', False):
	202	try:
	203	url = url_or_request.get_full_url()
	204	except AttributeError:
	205	url = url_or_request
	206	self.to_screen(u'Dumping request to ' + url)
	207	dump = base64.b64encode(webpage_bytes).decode('ascii')
	208	self._downloader.to_screen(dump)
d41e6efc PH	209	if self._downloader.params.get('write_pages', False):
	210	try:
	211	url = url_or_request.get_full_url()
	212	except AttributeError:
	213	url = url_or_request
	214	raw_filename = ('%s_%s.dump' % (video_id, url))
	215	filename = sanitize_filename(raw_filename, restricted=True)
	216	self.to_screen(u'Saving request to ' + filename)
	217	with open(filename, 'wb') as outf:
	218	outf.write(webpage_bytes)
	219
d6983cb4 PH	220	content = webpage_bytes.decode(encoding, 'replace')
	221	return (content, urlh)
	222
7cc3570e	223	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	224	""" Returns the data of the page as a string """
7cc3570e PH	225	res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
	226	if res is False:
	227	return res
	228	else:
	229	content, _ = res
	230	return content
d6983cb4	231
2a275ab0	232	def _download_xml(self, url_or_request, video_id,
e2b38da9 PH	233	note=u'Downloading XML', errnote=u'Unable to download XML',
e2b38da9 PH	234	transform_source=None):
267ed0c5 JMF	235	"""Return the xml as an xml.etree.ElementTree.Element"""
267ed0c5 JMF	236	xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
e2b38da9 PH	237	if transform_source:
e2b38da9 PH	238	xml_string = transform_source(xml_string)
267ed0c5 JMF	239	return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
267ed0c5 JMF	240
d6983cb4 PH	241	def to_screen(self, msg):
	242	"""Print msg to screen, prefixing it with '[ie_name]'"""
	243	self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
	244
	245	def report_extraction(self, id_or_name):
	246	"""Report information extraction."""
	247	self.to_screen(u'%s: Extracting information' % id_or_name)
	248
	249	def report_download_webpage(self, video_id):
	250	"""Report webpage download."""
	251	self.to_screen(u'%s: Downloading webpage' % video_id)
	252
	253	def report_age_confirmation(self):
	254	"""Report attempt to confirm age."""
	255	self.to_screen(u'Confirming age')
	256
fc79158d JMF	257	def report_login(self):
	258	"""Report attempt to log in."""
	259	self.to_screen(u'Logging in')
	260
d6983cb4	261	#Methods for following #608
7012b23c	262	def url_result(self, url, ie=None, video_id=None):
d6983cb4 PH	263	"""Returns a url that points to a page that should be processed"""
	264	#TODO: ie should be the class used for getting the info
	265	video_info = {'_type': 'url',
	266	'url': url,
	267	'ie_key': ie}
7012b23c PH	268	if video_id is not None:
7012b23c PH	269	video_info['id'] = video_id
d6983cb4 PH	270	return video_info
	271	def playlist_result(self, entries, playlist_id=None, playlist_title=None):
	272	"""Returns a playlist"""
	273	video_info = {'_type': 'playlist',
	274	'entries': entries}
	275	if playlist_id:
	276	video_info['id'] = playlist_id
	277	if playlist_title:
	278	video_info['title'] = playlist_title
	279	return video_info
	280
	281	def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	282	"""
	283	Perform a regex search on the given string, using a single or a list of
	284	patterns returning the first matching group.
	285	In case of failure return a default value or raise a WARNING or a
55b3e45b	286	RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4 PH	287	"""
	288	if isinstance(pattern, (str, compat_str, compiled_regex_type)):
	289	mobj = re.search(pattern, string, flags)
	290	else:
	291	for p in pattern:
	292	mobj = re.search(p, string, flags)
	293	if mobj: break
	294
	295	if sys.stderr.isatty() and os.name != 'nt':
	296	_name = u'\033[0;34m%s\033[0m' % name
	297	else:
	298	_name = name
	299
	300	if mobj:
	301	# return the first matching group
	302	return next(g for g in mobj.groups() if g is not None)
	303	elif default is not None:
	304	return default
	305	elif fatal:
55b3e45b	306	raise RegexNotFoundError(u'Unable to extract %s' % _name)
d6983cb4 PH	307	else:
d6983cb4 PH	308	self._downloader.report_warning(u'unable to extract %s; '
98bcd283	309	u'please report this issue on http://yt-dl.org/bug' % _name)
d6983cb4 PH	310	return None
	311
	312	def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	313	"""
	314	Like _search_regex, but strips HTML tags and unescapes entities.
	315	"""
	316	res = self._search_regex(pattern, string, name, default, fatal, flags)
	317	if res:
	318	return clean_html(res).strip()
	319	else:
	320	return res
	321
fc79158d JMF	322	def _get_login_info(self):
	323	"""
	324	Get the the login info as (username, password)
	325	It will look in the netrc file using the _NETRC_MACHINE value
	326	If there's no info available, return (None, None)
	327	"""
	328	if self._downloader is None:
	329	return (None, None)
	330
	331	username = None
	332	password = None
	333	downloader_params = self._downloader.params
	334
	335	# Attempt to use provided username and password or .netrc data
	336	if downloader_params.get('username', None) is not None:
	337	username = downloader_params['username']
	338	password = downloader_params['password']
	339	elif downloader_params.get('usenetrc', False):
	340	try:
	341	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	342	if info is not None:
	343	username = info[0]
	344	password = info[2]
	345	else:
	346	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	347	except (IOError, netrc.NetrcParseError) as err:
	348	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	349
	350	return (username, password)
	351
46720279 JMF	352	# Helper functions for extracting OpenGraph info
46720279 JMF	353	@staticmethod
ab2d5247	354	def _og_regexes(prop):
78fb87b2 JMF	355	content_re = r'content=(?:"([^>]+?)"\|\'(.+?)\')'
	356	property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
	357	template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247	358	return [
78fb87b2 JMF	359	template % (property_re, content_re),
78fb87b2 JMF	360	template % (content_re, property_re),
ab2d5247	361	]
46720279	362
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search `html` for the OpenGraph property `prop`.

    `name` is the label used in messages and defaults to
    'OpenGraph <prop>'. Extra keyword arguments are forwarded to
    _search_regex. Returns the matched value passed through
    unescapeHTML, or None when _search_regex returned None.
    """
    label = 'OpenGraph %s' % prop if name is None else name
    raw_value = self._search_regex(
        self._og_regexes(prop), html, label, flags=re.DOTALL, **kargs)
    return None if raw_value is None else unescapeHTML(raw_value)
46720279 JMF	370
def _og_search_thumbnail(self, html, **kargs):
    """Return the og:image value from `html`; a missing tag is not fatal."""
    return self._og_search_property(
        'image', html, u'thumbnail url',
        fatal=False, **kargs)
46720279 JMF	373
	374	def _og_search_description(self, html, **kargs):
	375	return self._og_search_property('description', html, fatal=False, **kargs)
	376
	377	def _og_search_title(self, html, **kargs):
	378	return self._og_search_property('title', html, **kargs)
	379
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """
    Return the OpenGraph video URL found in `html`.

    When `secure` is true the og:video:secure_url patterns are tried
    before the plain og:video ones. Remaining keyword arguments are
    forwarded to _html_search_regex.
    """
    og_props = ['video']
    if secure:
        # Secure URL patterns go first so they win when both tags exist.
        og_props.insert(0, 'video:secure_url')

    patterns = []
    for og_prop in og_props:
        patterns.extend(self._og_regexes(og_prop))
    return self._html_search_regex(patterns, html, name, **kargs)
46720279	384
def _html_search_meta(self, name, html, display_name=None):
    """
    Return the content attribute of the <meta> tag identified by `name`.

    The tag is matched case-insensitively when its itemprop, name or
    property attribute equals `name`. `display_name` is the label used
    in messages and defaults to `name`. The search is non-fatal
    (fatal=False is passed to _html_search_regex).
    """
    if display_name is None:
        display_name = name
    # The alternation must use a plain `|`; an escaped `\|` would demand the
    # literal text "itemprop|name|property" and never match a real tag.
    # The lookahead lets the identifying attribute appear anywhere in the
    # tag, before or after content=.
    return self._html_search_regex(
        r'''(?ix)<meta
                (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
        html, display_name, fatal=False)
	393
	394	def _dc_search_uploader(self, html):
	395	return self._html_search_meta('dc.creator', html, 'uploader')
	396
8dbe9899 PH	397	def _rta_search(self, html):
	398	# See http://www.rtalabel.org/index.php?content=howtofaq#single
	399	if re.search(r'(?ix)<meta\s+name="rating"\s+'
	400	r' content="RTA-5042-1996-1400-1577-RTA"',
	401	html):
	402	return 18
	403	return 0
	404
def _media_rating_search(self, html):
    """
    Map the 'rating' <meta> tag of `html` to a number.

    Returns None when the tag is missing or empty, or when its value
    (compared in lower case) is not one of the known ratings.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    meta_rating = self._html_search_meta('rating', html)
    if meta_rating:
        number_by_rating = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return number_by_rating.get(meta_rating.lower())
    return None
	420
	421
8dbe9899	422
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching search pseudo-URLs for this extractor."""
        # prefix is either empty, a positive integer or 'all'. The `|`
        # separators must be unescaped; an escaped `\|` would require
        # literal pipe characters in the query.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if `url` is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """
        Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix requests 1 result, 'all' requests _MAX_RESULTS and
        a number requests that many, capped at _MAX_RESULTS with a warning.
        Raises ExtractorError when `query` does not match the expected
        format or the requested number is not positive.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                # Defensive only: the regex already restricts the prefix
                # to integers starting with a non-zero digit.
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Public read-only access to the _SEARCH_KEY value."""
        return self._SEARCH_KEY