[yt-dlp.git] / youtube_dl / extractor / common.py

import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.
    url:            Final video URL.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from the format_id, width, height.
                                and format_note fields if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                ("3D" or "DASH video")
                    * width     Width of the video, if known
                    * height    Height of the video, if known
                    * abr       Average audio bitrate in KBit/s
                    * acodec    Name of the audio codec in use
                    * vbr       Average video bitrate in KBit/s
                    * vcodec    Name of the video codec in use
                    * filesize  The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj: break

        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
        
        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source for this class's search pseudo-URLs."""
        query_part = r'(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return r'%s%s' % (cls._SEARCH_KEY, query_part)

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search pseudo-URL for this class."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many (capped at _MAX_RESULTS with a warning).
        Raises ExtractorError for a query that does not match the format.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY
Commit	Line	Data
d6983cb4 PH	1	import base64
	2	import os
	3	import re
	4	import socket
	5	import sys
fc79158d	6	import netrc
267ed0c5	7	import xml.etree.ElementTree
d6983cb4 PH	8
	9	from ..utils import (
	10	compat_http_client,
	11	compat_urllib_error,
d6983cb4 PH	12	compat_str,
	13
	14	clean_html,
	15	compiled_regex_type,
	16	ExtractorError,
55b3e45b	17	RegexNotFoundError,
d41e6efc	18	sanitize_filename,
f38de77f	19	unescapeHTML,
d6983cb4 PH	20	)
d6983cb4 PH	21
dca08720	22
d6983cb4 PH	23	class InfoExtractor(object):
	24	"""Information Extractor class.
	25
	26	Information extractors are the classes that, given a URL, extract
	27	information about the video (or videos) the URL refers to. This
	28	information includes the real video URL, the video title, author and
	29	others. The information is stored in a dictionary which is then
	30	passed to the FileDownloader. The FileDownloader processes this
	31	information possibly downloading the video to the file system, among
	32	other possible outcomes.
	33
	34	The dictionaries must include the following fields:
	35
	36	id: Video identifier.
d6983cb4	37	title: Video title, unescaped.
c0ba0f48	38	url: Final video URL.
d6983cb4 PH	39	ext: Video filename extension.
d6983cb4 PH	40
2f5865cc PH	41	Instead of url and ext, formats can also specified.
2f5865cc PH	42
d6983cb4 PH	43	The following fields are optional:
	44
	45	format: The video format, defaults to ext (used for --get-format)
73e79f2a PH	46	thumbnails: A list of dictionaries (with the entries "resolution" and
73e79f2a PH	47	"url") for the varying thumbnails
d6983cb4 PH	48	thumbnail: Full URL to a video thumbnail image.
	49	description: One-line video description.
	50	uploader: Full name of the video uploader.
	51	upload_date: Video upload date (YYYYMMDD).
	52	uploader_id: Nickname or id of the video uploader.
	53	location: Physical location of the video.
	54	player_url: SWF Player URL (used for rtmpdump).
5d51a883 JMF	55	subtitles: The subtitle file contents as a dictionary in the format
5d51a883 JMF	56	{language: subtitles}.
c0ba0f48	57	duration: Length of the video in seconds, as an integer.
f3d29461	58	view_count: How many users have watched the video on the platform.
19e3dfc9 PH	59	like_count: Number of positive ratings of the video
	60	dislike_count: Number of negative ratings of the video
	61	comment_count: Number of comments on the video
d6983cb4 PH	62	urlhandle: [internal] The urlHandle to be used to download the file,
d6983cb4 PH	63	like returned by urllib.request.urlopen
8dbe9899	64	age_limit: Age restriction for the video, as an integer (years)
deefc05b PH	65	formats: A list of dictionaries for each format available, it must
	66	be ordered from worst to best quality. Potential fields:
	67	* url Mandatory. The URL of the video file
	68	* ext Will be calculated from url if missing
	69	* format A human-readable description of the format
	70	("mp4 container with h264/opus").
b5d0d817	71	Calculated from the format_id, width, height.
8c51aa65	72	and format_note fields if missing.
deefc05b PH	73	* format_id A short description of the format
deefc05b PH	74	("mp4_h264_opus" or "19")
8c51aa65 JMF	75	* format_note Additional info about the format
8c51aa65 JMF	76	("3D" or "DASH video")
deefc05b PH	77	* width Width of the video, if known
deefc05b PH	78	* height Height of the video, if known
91c7271a PH	79	* abr Average audio bitrate in KBit/s
	80	* acodec Name of the audio codec in use
	81	* vbr Average video bitrate in KBit/s
	82	* vcodec Name of the video codec in use
02dbf93f	83	* filesize The number of bytes, if known in advance
9103bbc5 JMF	84	webpage_url: The url to the video webpage, if given to youtube-dl it
	85	should allow to get the same result again. (It will be set
	86	by YoutubeDL if it's missing)
d6983cb4	87
deefc05b	88	Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 PH	89
	90	Subclasses of this one should re-define the _real_initialize() and
	91	_real_extract() methods and define a _VALID_URL regexp.
	92	Probably, they should also be added to the list of extractors.
	93
	94	_real_extract() must return a list of information dictionaries as
	95	described above.
	96
	97	Finally, the _WORKING attribute should be set to False for broken IEs
	98	in order to warn the users and skip the tests.
	99	"""
	100
	101	_ready = False
	102	_downloader = None
	103	_WORKING = True
	104
	105	def __init__(self, downloader=None):
	106	"""Constructor. Receives an optional downloader."""
	107	self._ready = False
	108	self.set_downloader(downloader)
	109
	110	@classmethod
	111	def suitable(cls, url):
	112	"""Receives a URL and returns True if suitable for this IE."""
79cb2577 PH	113
	114	# This does not use has/getattr intentionally - we want to know whether
	115	# we have cached the regexp for this class, whereas getattr would also
	116	# match the superclass
	117	if '_VALID_URL_RE' not in cls.__dict__:
	118	cls._VALID_URL_RE = re.compile(cls._VALID_URL)
	119	return cls._VALID_URL_RE.match(url) is not None
d6983cb4 PH	120
	121	@classmethod
	122	def working(cls):
	123	"""Getter method for _WORKING."""
	124	return cls._WORKING
	125
	126	def initialize(self):
	127	"""Initializes an instance (authentication, etc)."""
	128	if not self._ready:
	129	self._real_initialize()
	130	self._ready = True
	131
	132	def extract(self, url):
	133	"""Extracts URL information and returns it in list of dicts."""
	134	self.initialize()
	135	return self._real_extract(url)
	136
	137	def set_downloader(self, downloader):
	138	"""Sets the downloader for this IE."""
	139	self._downloader = downloader
	140
	141	def _real_initialize(self):
	142	"""Real initialization process. Redefine in subclasses."""
	143	pass
	144
	145	def _real_extract(self, url):
	146	"""Real extraction process. Redefine in subclasses."""
	147	pass
	148
56c73665 JMF	149	@classmethod
	150	def ie_key(cls):
	151	"""A string for getting the InfoExtractor with get_info_extractor"""
	152	return cls.__name__[:-2]
	153
d6983cb4 PH	154	@property
	155	def IE_NAME(self):
	156	return type(self).__name__[:-2]
	157
7cc3570e	158	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 PH	159	""" Returns the response handle """
	160	if note is None:
	161	self.report_download_webpage(video_id)
	162	elif note is not False:
7cc3570e PH	163	if video_id is None:
	164	self.to_screen(u'%s' % (note,))
	165	else:
	166	self.to_screen(u'%s: %s' % (video_id, note))
d6983cb4	167	try:
dca08720	168	return self._downloader.urlopen(url_or_request)
d6983cb4 PH	169	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
	170	if errnote is None:
	171	errnote = u'Unable to download webpage'
7cc3570e PH	172	errmsg = u'%s: %s' % (errnote, compat_str(err))
	173	if fatal:
	174	raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
	175	else:
	176	self._downloader.report_warning(errmsg)
	177	return False
d6983cb4	178
7cc3570e	179	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	180	""" Returns a tuple (page content as string, URL handle) """
b9d3e163 PH	181
	182	# Strip hashes from the URL (#1038)
	183	if isinstance(url_or_request, (compat_str, str)):
	184	url_or_request = url_or_request.partition('#')[0]
	185
7cc3570e PH	186	urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
	187	if urlh is False:
	188	assert not fatal
	189	return False
d6983cb4	190	content_type = urlh.headers.get('Content-Type', '')
f143d86a	191	webpage_bytes = urlh.read()
d6983cb4 PH	192	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	193	if m:
	194	encoding = m.group(1)
	195	else:
0d75ae2c	196	m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a PH	197	webpage_bytes[:1024])
	198	if m:
	199	encoding = m.group(1).decode('ascii')
	200	else:
	201	encoding = 'utf-8'
d6983cb4 PH	202	if self._downloader.params.get('dump_intermediate_pages', False):
	203	try:
	204	url = url_or_request.get_full_url()
	205	except AttributeError:
	206	url = url_or_request
	207	self.to_screen(u'Dumping request to ' + url)
	208	dump = base64.b64encode(webpage_bytes).decode('ascii')
	209	self._downloader.to_screen(dump)
d41e6efc PH	210	if self._downloader.params.get('write_pages', False):
	211	try:
	212	url = url_or_request.get_full_url()
	213	except AttributeError:
	214	url = url_or_request
	215	raw_filename = ('%s_%s.dump' % (video_id, url))
	216	filename = sanitize_filename(raw_filename, restricted=True)
	217	self.to_screen(u'Saving request to ' + filename)
	218	with open(filename, 'wb') as outf:
	219	outf.write(webpage_bytes)
	220
d6983cb4 PH	221	content = webpage_bytes.decode(encoding, 'replace')
	222	return (content, urlh)
	223
7cc3570e	224	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	225	""" Returns the data of the page as a string """
7cc3570e PH	226	res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
	227	if res is False:
	228	return res
	229	else:
	230	content, _ = res
	231	return content
d6983cb4	232
2a275ab0	233	def _download_xml(self, url_or_request, video_id,
e2b38da9 PH	234	note=u'Downloading XML', errnote=u'Unable to download XML',
e2b38da9 PH	235	transform_source=None):
267ed0c5 JMF	236	"""Return the xml as an xml.etree.ElementTree.Element"""
267ed0c5 JMF	237	xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
e2b38da9 PH	238	if transform_source:
e2b38da9 PH	239	xml_string = transform_source(xml_string)
267ed0c5 JMF	240	return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
267ed0c5 JMF	241
d6983cb4 PH	242	def to_screen(self, msg):
	243	"""Print msg to screen, prefixing it with '[ie_name]'"""
	244	self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
	245
	246	def report_extraction(self, id_or_name):
	247	"""Report information extraction."""
	248	self.to_screen(u'%s: Extracting information' % id_or_name)
	249
	250	def report_download_webpage(self, video_id):
	251	"""Report webpage download."""
	252	self.to_screen(u'%s: Downloading webpage' % video_id)
	253
	254	def report_age_confirmation(self):
	255	"""Report attempt to confirm age."""
	256	self.to_screen(u'Confirming age')
	257
fc79158d JMF	258	def report_login(self):
	259	"""Report attempt to log in."""
	260	self.to_screen(u'Logging in')
	261
d6983cb4	262	#Methods for following #608
7012b23c	263	def url_result(self, url, ie=None, video_id=None):
d6983cb4 PH	264	"""Returns a url that points to a page that should be processed"""
	265	#TODO: ie should be the class used for getting the info
	266	video_info = {'_type': 'url',
	267	'url': url,
	268	'ie_key': ie}
7012b23c PH	269	if video_id is not None:
7012b23c PH	270	video_info['id'] = video_id
d6983cb4 PH	271	return video_info
	272	def playlist_result(self, entries, playlist_id=None, playlist_title=None):
	273	"""Returns a playlist"""
	274	video_info = {'_type': 'playlist',
	275	'entries': entries}
	276	if playlist_id:
	277	video_info['id'] = playlist_id
	278	if playlist_title:
	279	video_info['title'] = playlist_title
	280	return video_info
	281
	282	def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	283	"""
	284	Perform a regex search on the given string, using a single or a list of
	285	patterns returning the first matching group.
	286	In case of failure return a default value or raise a WARNING or a
55b3e45b	287	RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4 PH	288	"""
	289	if isinstance(pattern, (str, compat_str, compiled_regex_type)):
	290	mobj = re.search(pattern, string, flags)
	291	else:
	292	for p in pattern:
	293	mobj = re.search(p, string, flags)
	294	if mobj: break
	295
	296	if sys.stderr.isatty() and os.name != 'nt':
	297	_name = u'\033[0;34m%s\033[0m' % name
	298	else:
	299	_name = name
	300
	301	if mobj:
	302	# return the first matching group
	303	return next(g for g in mobj.groups() if g is not None)
	304	elif default is not None:
	305	return default
	306	elif fatal:
55b3e45b	307	raise RegexNotFoundError(u'Unable to extract %s' % _name)
d6983cb4 PH	308	else:
d6983cb4 PH	309	self._downloader.report_warning(u'unable to extract %s; '
98bcd283	310	u'please report this issue on http://yt-dl.org/bug' % _name)
d6983cb4 PH	311	return None
	312
	313	def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	314	"""
	315	Like _search_regex, but strips HTML tags and unescapes entities.
	316	"""
	317	res = self._search_regex(pattern, string, name, default, fatal, flags)
	318	if res:
	319	return clean_html(res).strip()
	320	else:
	321	return res
	322
fc79158d JMF	323	def _get_login_info(self):
	324	"""
	325	Get the the login info as (username, password)
	326	It will look in the netrc file using the _NETRC_MACHINE value
	327	If there's no info available, return (None, None)
	328	"""
	329	if self._downloader is None:
	330	return (None, None)
	331
	332	username = None
	333	password = None
	334	downloader_params = self._downloader.params
	335
	336	# Attempt to use provided username and password or .netrc data
	337	if downloader_params.get('username', None) is not None:
	338	username = downloader_params['username']
	339	password = downloader_params['password']
	340	elif downloader_params.get('usenetrc', False):
	341	try:
	342	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	343	if info is not None:
	344	username = info[0]
	345	password = info[2]
	346	else:
	347	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	348	except (IOError, netrc.NetrcParseError) as err:
	349	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	350
	351	return (username, password)
	352
46720279 JMF	353	# Helper functions for extracting OpenGraph info
46720279 JMF	354	@staticmethod
ab2d5247	355	def _og_regexes(prop):
78fb87b2 JMF	356	content_re = r'content=(?:"([^>]+?)"\|\'(.+?)\')'
	357	property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
	358	template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247	359	return [
78fb87b2 JMF	360	template % (property_re, content_re),
78fb87b2 JMF	361	template % (content_re, property_re),
ab2d5247	362	]
46720279	363
3c4e6d83	364	def _og_search_property(self, prop, html, name=None, **kargs):
46720279	365	if name is None:
3c4e6d83	366	name = 'OpenGraph %s' % prop
ab2d5247	367	escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398 PH	368	if escaped is None:
	369	return None
	370	return unescapeHTML(escaped)
46720279 JMF	371
46720279 JMF	372	def _og_search_thumbnail(self, html, **kargs):
3c4e6d83	373	return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279 JMF	374
	375	def _og_search_description(self, html, **kargs):
	376	return self._og_search_property('description', html, fatal=False, **kargs)
	377
	378	def _og_search_title(self, html, **kargs):
	379	return self._og_search_property('title', html, **kargs)
	380
8ffa13e0	381	def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
ab2d5247 JMF	382	regexes = self._og_regexes('video')
ab2d5247 JMF	383	if secure: regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0	384	return self._html_search_regex(regexes, html, name, **kargs)
46720279	385
59040888 PH	386	def _html_search_meta(self, name, html, display_name=None):
	387	if display_name is None:
	388	display_name = name
	389	return self._html_search_regex(
aaebed13 PH	390	r'''(?ix)<meta
aaebed13 PH	391	(?=[^>]+(?:itemprop\|name\|property)=["\']%s["\'])
59040888 PH	392	[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
	393	html, display_name, fatal=False)
	394
	395	def _dc_search_uploader(self, html):
	396	return self._html_search_meta('dc.creator', html, 'uploader')
	397
8dbe9899 PH	398	def _rta_search(self, html):
	399	# See http://www.rtalabel.org/index.php?content=howtofaq#single
	400	if re.search(r'(?ix)<meta\s+name="rating"\s+'
	401	r' content="RTA-5042-1996-1400-1577-RTA"',
	402	html):
	403	return 18
	404	return 0
	405
59040888 PH	406	def _media_rating_search(self, html):
	407	# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
	408	rating = self._html_search_meta('rating', html)
	409
	410	if not rating:
	411	return None
	412
	413	RATING_TABLE = {
	414	'safe for kids': 0,
	415	'general': 8,
	416	'14 years': 14,
	417	'mature': 17,
	418	'restricted': 19,
	419	}
	420	return RATING_TABLE.get(rating.lower(), None)
	421
	422
8dbe9899	423
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """
        Return the regular expression matching this extractor's queries.

        The 'prefix' group is empty, a positive number or 'all'; the
        'query' group is everything after the colon.
        """
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """
        Parse a search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many results (capped at _MAX_RESULTS with a
        warning). Raises ExtractorError if the query does not match
        _make_valid_url().
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                # Not reachable with the pattern above; kept as a safeguard
                # in case _make_valid_url is overridden more permissively.
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the _SEARCH_KEY value."""
        return self._SEARCH_KEY