[yt-dlp.git] / youtube_dl / extractor / common.py

import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)
# Sentinel default: lets _search_regex() tell "no default supplied" apart from
# an explicit default value such as None.
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present, the formats get sorted by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL of the video webpage; giving it to youtube-dl
                    again should yield the same result. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Set to True by initialize() once _real_initialize() has run.
    _ready = False
    # The downloader object, assigned through set_downloader().
    _downloader = None
    # Class-level health flag returned by working().
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        Marks the instance as not yet initialized and stores the downloader
        through set_downloader().
        """
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class attribute unchanged; the class default is True.
        """
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts.

        Makes sure initialize() has run, then returns whatever
        _real_extract(url) returns.
        """
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored as self._downloader; the other methods of this
        class use it for network access, output and option lookup.
        """
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Called by initialize() the first time it runs. The base
        implementation does nothing.
        """
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Called by extract() with the URL to process. The base implementation
        does nothing and returns None.
        """
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor

        It is the class name minus its last two characters (presumably the
        conventional "IE" suffix; the slice does not check for it).
        """
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Class name of this instance minus its last two characters.

        Used as the '[name]' prefix by to_screen() and report_warning().
        """
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle)

        Returns False instead when _request_webpage() returns False, i.e. on
        a download error with fatal false or with errnote=False.

        The text encoding is taken from the Content-Type header, else from a
        <meta ... charset=...> tag within the first 1024 bytes of the body,
        else UTF-8. An encoding name Python does not know also falls back to
        UTF-8. Undecodable bytes are replaced rather than raising.

        Two downloader params are honoured: 'dump_intermediate_pages' prints
        the raw body base64-encoded, and 'write_pages' saves the raw body to
        a file named after the video id and the URL.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # Deliberately no assertion on fatal: errnote=False makes
            # _request_webpage() return False even when fatal is true.
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            # The charset value may be a quoted string and may carry padding
            encoding = m.group(1).strip().strip('"\'')
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # url_or_request is either a request object or a plain URL string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server or page announced a charset Python has no codec for
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # Output goes through the downloader, so one must have been set.
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction.

        Prints '<id_or_name>: Extracting information' via to_screen().
        """
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download.

        Prints '<video_id>: Downloading webpage' via to_screen(). This is the
        default progress message of _request_webpage().
        """
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age.

        Prints 'Confirming age' via to_screen().
        """
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in.

        Prints 'Logging in' via to_screen().
        """
        self.to_screen(u'Logging in')

    #Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info
    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or report a warning or raise
        a RegexNotFoundError, depending on fatal, specifying the field name.

        pattern is a string, a compiled regex or an iterable of these; the
        patterns are tried in order and the first one that matches wins. An
        empty iterable counts as "no match". The returned value is the first
        group of the match that is not None, so every pattern needs at least
        one capturing group.

        On failure: default is returned when one was supplied (even None);
        otherwise RegexNotFoundError is raised if fatal is true, else a
        warning is reported and None is returned.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            patterns = [pattern]
        else:
            patterns = pattern

        # Start from None so that an empty pattern list is a plain failure
        mobj = None
        for p in patterns:
            mobj = re.search(p, string, flags)
            if mobj:
                break

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        if default is not _NO_DEFAULT:
            return default

        # Highlight the field name when writing to a terminal (not on Windows)
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.

        A falsy result (None, an empty string, a falsy default) is returned
        unchanged; anything else goes through clean_html() and is stripped
        of surrounding whitespace.
        """
        found = self._search_regex(pattern, string, name, default, fatal, flags)
        if not found:
            return found
        return clean_html(found).strip()

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
        
        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Return the HTML-unescaped content of the og:<prop> meta tag in html.

        name is the label used in error messages and defaults to
        'OpenGraph <prop>'. Extra keyword arguments are passed on to
        _search_regex(). A None search result is returned as None.
        """
        label = 'OpenGraph %s' % prop if name is None else name
        escaped = self._search_regex(self._og_regexes(prop), html, label, flags=re.DOTALL, **kargs)
        return None if escaped is None else unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Look up the og:image property, labelled 'thumbnail url' in messages.

        fatal=False is always passed on, so a missing value is not an error
        and kargs must not contain 'fatal'.
        """
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Look up the og:description property.

        fatal=False is always passed on, so a missing value is not an error
        and kargs must not contain 'fatal'.
        """
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Look up the og:title property.

        Unlike the thumbnail and description helpers this does not force
        fatal=False, so the fatal default of _search_regex() applies unless
        kargs overrides it.
        """
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        """Return the content of the 'dc.creator' meta tag, labelled 'uploader' in messages."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that handle paged search queries.

    Queries have the form _SEARCH_KEY<prefix>:<query>, where <prefix> is
    empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regexp source that matches this extractor's queries."""
        query_re = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return query_re % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError when the query does not have the expected
        form. A number above _MAX_RESULTS is reduced to _MAX_RESULTS after
        reporting a warning.
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = parsed.group('prefix'), parsed.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY defined by the subclass."""
        return self._SEARCH_KEY
Commit	Line	Data
d6983cb4 PH	1	import base64
	2	import os
	3	import re
	4	import socket
	5	import sys
fc79158d	6	import netrc
267ed0c5	7	import xml.etree.ElementTree
d6983cb4 PH	8
	9	from ..utils import (
	10	compat_http_client,
	11	compat_urllib_error,
c7deaa4c	12	compat_urllib_parse_urlparse,
d6983cb4 PH	13	compat_str,
	14
	15	clean_html,
	16	compiled_regex_type,
	17	ExtractorError,
55b3e45b	18	RegexNotFoundError,
d41e6efc	19	sanitize_filename,
f38de77f	20	unescapeHTML,
d6983cb4	21	)
46374a56	22	_NO_DEFAULT = object()
d6983cb4	23
dca08720	24
d6983cb4 PH	25	class InfoExtractor(object):
	26	"""Information Extractor class.
	27
	28	Information extractors are the classes that, given a URL, extract
	29	information about the video (or videos) the URL refers to. This
	30	information includes the real video URL, the video title, author and
	31	others. The information is stored in a dictionary which is then
	32	passed to the FileDownloader. The FileDownloader processes this
	33	information possibly downloading the video to the file system, among
	34	other possible outcomes.
	35
	36	The dictionaries must include the following fields:
	37
	38	id: Video identifier.
d6983cb4	39	title: Video title, unescaped.
d67b0b15	40
f49d89ee	41	Additionally, it must contain either a formats entry or a url one:
d67b0b15	42
f49d89ee PH	43	formats: A list of dictionaries for each format available, ordered
	44	from worst to best quality.
	45
	46	Potential fields:
d67b0b15 PH	47	* url Mandatory. The URL of the video file
	48	* ext Will be calculated from url if missing
	49	* format A human-readable description of the format
	50	("mp4 container with h264/opus").
	51	Calculated from the format_id, width, height.
	52	and format_note fields if missing.
	53	* format_id A short description of the format
5d4f3985 PH	54	("mp4_h264_opus" or "19").
5d4f3985 PH	55	Technically optional, but strongly recommended.
d67b0b15 PH	56	* format_note Additional info about the format
	57	("3D" or "DASH video")
	58	* width Width of the video, if known
	59	* height Height of the video, if known
f49d89ee	60	* resolution Textual description of width and height
7217e148	61	* tbr Average bitrate of audio and video in KBit/s
d67b0b15 PH	62	* abr Average audio bitrate in KBit/s
	63	* acodec Name of the audio codec in use
	64	* vbr Average video bitrate in KBit/s
	65	* vcodec Name of the video codec in use
	66	* filesize The number of bytes, if known in advance
	67	* player_url SWF Player URL (used for rtmpdump).
c7deaa4c PH	68	* protocol The protocol that will be used for the actual
	69	download, lower-case.
	70	"http", "https", "rtsp", "rtmp" or so.
f49d89ee PH	71	* preference Order number of this format. If this field is
	72	present, the formats get sorted by this field.
	73	-1 for default (order by other properties),
	74	-2 or smaller for less than default.
c0ba0f48	75	url: Final video URL.
d6983cb4	76	ext: Video filename extension.
d67b0b15 PH	77	format: The video format, defaults to ext (used for --get-format)
d67b0b15 PH	78	player_url: SWF Player URL (used for rtmpdump).
2f5865cc	79
d6983cb4 PH	80	The following fields are optional:
d6983cb4 PH	81
73e79f2a PH	82	thumbnails: A list of dictionaries (with the entries "resolution" and
73e79f2a PH	83	"url") for the varying thumbnails
d6983cb4 PH	84	thumbnail: Full URL to a video thumbnail image.
	85	description: One-line video description.
	86	uploader: Full name of the video uploader.
	87	upload_date: Video upload date (YYYYMMDD).
	88	uploader_id: Nickname or id of the video uploader.
	89	location: Physical location of the video.
5d51a883 JMF	90	subtitles: The subtitle file contents as a dictionary in the format
5d51a883 JMF	91	{language: subtitles}.
c0ba0f48	92	duration: Length of the video in seconds, as an integer.
f3d29461	93	view_count: How many users have watched the video on the platform.
19e3dfc9 PH	94	like_count: Number of positive ratings of the video
	95	dislike_count: Number of negative ratings of the video
	96	comment_count: Number of comments on the video
8dbe9899	97	age_limit: Age restriction for the video, as an integer (years)
9103bbc5 JMF	98	webpage_url: The url to the video webpage, if given to youtube-dl it
	99	should allow to get the same result again. (It will be set
	100	by YoutubeDL if it's missing)
d6983cb4	101
deefc05b	102	Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 PH	103
	104	Subclasses of this one should re-define the _real_initialize() and
	105	_real_extract() methods and define a _VALID_URL regexp.
	106	Probably, they should also be added to the list of extractors.
	107
	108	_real_extract() must return a list of information dictionaries as
	109	described above.
	110
	111	Finally, the _WORKING attribute should be set to False for broken IEs
	112	in order to warn the users and skip the tests.
	113	"""
	114
	115	_ready = False
	116	_downloader = None
	117	_WORKING = True
	118
	119	def __init__(self, downloader=None):
	120	"""Constructor. Receives an optional downloader."""
	121	self._ready = False
	122	self.set_downloader(downloader)
	123
	124	@classmethod
	125	def suitable(cls, url):
	126	"""Receives a URL and returns True if suitable for this IE."""
79cb2577 PH	127
	128	# This does not use has/getattr intentionally - we want to know whether
	129	# we have cached the regexp for this class, whereas getattr would also
	130	# match the superclass
	131	if '_VALID_URL_RE' not in cls.__dict__:
	132	cls._VALID_URL_RE = re.compile(cls._VALID_URL)
	133	return cls._VALID_URL_RE.match(url) is not None
d6983cb4 PH	134
	135	@classmethod
	136	def working(cls):
	137	"""Getter method for _WORKING."""
	138	return cls._WORKING
	139
	140	def initialize(self):
	141	"""Initializes an instance (authentication, etc)."""
	142	if not self._ready:
	143	self._real_initialize()
	144	self._ready = True
	145
	146	def extract(self, url):
	147	"""Extracts URL information and returns it in list of dicts."""
	148	self.initialize()
	149	return self._real_extract(url)
	150
	151	def set_downloader(self, downloader):
	152	"""Sets the downloader for this IE."""
	153	self._downloader = downloader
	154
	155	def _real_initialize(self):
	156	"""Real initialization process. Redefine in subclasses."""
	157	pass
	158
	159	def _real_extract(self, url):
	160	"""Real extraction process. Redefine in subclasses."""
	161	pass
	162
56c73665 JMF	163	@classmethod
	164	def ie_key(cls):
	165	"""A string for getting the InfoExtractor with get_info_extractor"""
	166	return cls.__name__[:-2]
	167
d6983cb4 PH	168	@property
	169	def IE_NAME(self):
	170	return type(self).__name__[:-2]
	171
7cc3570e	172	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 PH	173	""" Returns the response handle """
	174	if note is None:
	175	self.report_download_webpage(video_id)
	176	elif note is not False:
7cc3570e PH	177	if video_id is None:
	178	self.to_screen(u'%s' % (note,))
	179	else:
	180	self.to_screen(u'%s: %s' % (video_id, note))
d6983cb4	181	try:
dca08720	182	return self._downloader.urlopen(url_or_request)
d6983cb4	183	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3 PH	184	if errnote is False:
aa94a6d3 PH	185	return False
d6983cb4 PH	186	if errnote is None:
d6983cb4 PH	187	errnote = u'Unable to download webpage'
7cc3570e PH	188	errmsg = u'%s: %s' % (errnote, compat_str(err))
	189	if fatal:
	190	raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
	191	else:
	192	self._downloader.report_warning(errmsg)
	193	return False
d6983cb4	194
7cc3570e	195	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	196	""" Returns a tuple (page content as string, URL handle) """
b9d3e163 PH	197
	198	# Strip hashes from the URL (#1038)
	199	if isinstance(url_or_request, (compat_str, str)):
	200	url_or_request = url_or_request.partition('#')[0]
	201
7cc3570e PH	202	urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
	203	if urlh is False:
	204	assert not fatal
	205	return False
d6983cb4	206	content_type = urlh.headers.get('Content-Type', '')
f143d86a	207	webpage_bytes = urlh.read()
d6983cb4 PH	208	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	209	if m:
	210	encoding = m.group(1)
	211	else:
0d75ae2c	212	m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a PH	213	webpage_bytes[:1024])
	214	if m:
	215	encoding = m.group(1).decode('ascii')
	216	else:
	217	encoding = 'utf-8'
d6983cb4 PH	218	if self._downloader.params.get('dump_intermediate_pages', False):
	219	try:
	220	url = url_or_request.get_full_url()
	221	except AttributeError:
	222	url = url_or_request
	223	self.to_screen(u'Dumping request to ' + url)
	224	dump = base64.b64encode(webpage_bytes).decode('ascii')
	225	self._downloader.to_screen(dump)
d41e6efc PH	226	if self._downloader.params.get('write_pages', False):
	227	try:
	228	url = url_or_request.get_full_url()
	229	except AttributeError:
	230	url = url_or_request
	231	raw_filename = ('%s_%s.dump' % (video_id, url))
	232	filename = sanitize_filename(raw_filename, restricted=True)
	233	self.to_screen(u'Saving request to ' + filename)
	234	with open(filename, 'wb') as outf:
	235	outf.write(webpage_bytes)
	236
d6983cb4 PH	237	content = webpage_bytes.decode(encoding, 'replace')
	238	return (content, urlh)
	239
7cc3570e	240	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	241	""" Returns the data of the page as a string """
7cc3570e PH	242	res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
	243	if res is False:
	244	return res
	245	else:
	246	content, _ = res
	247	return content
d6983cb4	248
2a275ab0	249	def _download_xml(self, url_or_request, video_id,
e2b38da9 PH	250	note=u'Downloading XML', errnote=u'Unable to download XML',
e2b38da9 PH	251	transform_source=None):
267ed0c5 JMF	252	"""Return the xml as an xml.etree.ElementTree.Element"""
267ed0c5 JMF	253	xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
e2b38da9 PH	254	if transform_source:
e2b38da9 PH	255	xml_string = transform_source(xml_string)
267ed0c5 JMF	256	return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
267ed0c5 JMF	257
f45f96f8 PH	258	def report_warning(self, msg, video_id=None):
	259	idstr = u'' if video_id is None else u'%s: ' % video_id
	260	self._downloader.report_warning(
	261	u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
	262
d6983cb4 PH	263	def to_screen(self, msg):
	264	"""Print msg to screen, prefixing it with '[ie_name]'"""
	265	self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
	266
	267	def report_extraction(self, id_or_name):
	268	"""Report information extraction."""
	269	self.to_screen(u'%s: Extracting information' % id_or_name)
	270
	271	def report_download_webpage(self, video_id):
	272	"""Report webpage download."""
	273	self.to_screen(u'%s: Downloading webpage' % video_id)
	274
	275	def report_age_confirmation(self):
	276	"""Report attempt to confirm age."""
	277	self.to_screen(u'Confirming age')
	278
fc79158d JMF	279	def report_login(self):
	280	"""Report attempt to log in."""
	281	self.to_screen(u'Logging in')
	282
d6983cb4	283	#Methods for following #608
c0d0b01f JMF	284	@staticmethod
c0d0b01f JMF	285	def url_result(url, ie=None, video_id=None):
d6983cb4 PH	286	"""Returns a url that points to a page that should be processed"""
	287	#TODO: ie should be the class used for getting the info
	288	video_info = {'_type': 'url',
	289	'url': url,
	290	'ie_key': ie}
7012b23c PH	291	if video_id is not None:
7012b23c PH	292	video_info['id'] = video_id
d6983cb4	293	return video_info
c0d0b01f JMF	294	@staticmethod
c0d0b01f JMF	295	def playlist_result(entries, playlist_id=None, playlist_title=None):
d6983cb4 PH	296	"""Returns a playlist"""
	297	video_info = {'_type': 'playlist',
	298	'entries': entries}
	299	if playlist_id:
	300	video_info['id'] = playlist_id
	301	if playlist_title:
	302	video_info['title'] = playlist_title
	303	return video_info
	304
46374a56	305	def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
d6983cb4 PH	306	"""
	307	Perform a regex search on the given string, using a single or a list of
	308	patterns returning the first matching group.
	309	In case of failure return a default value or raise a WARNING or a
55b3e45b	310	RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4 PH	311	"""
	312	if isinstance(pattern, (str, compat_str, compiled_regex_type)):
	313	mobj = re.search(pattern, string, flags)
	314	else:
	315	for p in pattern:
	316	mobj = re.search(p, string, flags)
	317	if mobj: break
	318
87a28127	319	if os.name != 'nt' and sys.stderr.isatty():
d6983cb4 PH	320	_name = u'\033[0;34m%s\033[0m' % name
	321	else:
	322	_name = name
	323
	324	if mobj:
	325	# return the first matching group
	326	return next(g for g in mobj.groups() if g is not None)
46374a56	327	elif default is not _NO_DEFAULT:
d6983cb4 PH	328	return default
d6983cb4 PH	329	elif fatal:
55b3e45b	330	raise RegexNotFoundError(u'Unable to extract %s' % _name)
d6983cb4 PH	331	else:
d6983cb4 PH	332	self._downloader.report_warning(u'unable to extract %s; '
98bcd283	333	u'please report this issue on http://yt-dl.org/bug' % _name)
d6983cb4 PH	334	return None
d6983cb4 PH	335
46374a56	336	def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
d6983cb4 PH	337	"""
	338	Like _search_regex, but strips HTML tags and unescapes entities.
	339	"""
	340	res = self._search_regex(pattern, string, name, default, fatal, flags)
	341	if res:
	342	return clean_html(res).strip()
	343	else:
	344	return res
	345
fc79158d JMF	346	def _get_login_info(self):
	347	"""
	348	Get the the login info as (username, password)
	349	It will look in the netrc file using the _NETRC_MACHINE value
	350	If there's no info available, return (None, None)
	351	"""
	352	if self._downloader is None:
	353	return (None, None)
	354
	355	username = None
	356	password = None
	357	downloader_params = self._downloader.params
	358
	359	# Attempt to use provided username and password or .netrc data
	360	if downloader_params.get('username', None) is not None:
	361	username = downloader_params['username']
	362	password = downloader_params['password']
	363	elif downloader_params.get('usenetrc', False):
	364	try:
	365	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	366	if info is not None:
	367	username = info[0]
	368	password = info[2]
	369	else:
	370	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	371	except (IOError, netrc.NetrcParseError) as err:
	372	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	373
	374	return (username, password)
	375
46720279 JMF	376	# Helper functions for extracting OpenGraph info
46720279 JMF	377	@staticmethod
ab2d5247	378	def _og_regexes(prop):
78fb87b2 JMF	379	content_re = r'content=(?:"([^>]+?)"\|\'(.+?)\')'
	380	property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
	381	template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247	382	return [
78fb87b2 JMF	383	template % (property_re, content_re),
78fb87b2 JMF	384	template % (content_re, property_re),
ab2d5247	385	]
46720279	386
3c4e6d83	387	def _og_search_property(self, prop, html, name=None, **kargs):
46720279	388	if name is None:
3c4e6d83	389	name = 'OpenGraph %s' % prop
ab2d5247	390	escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398 PH	391	if escaped is None:
	392	return None
	393	return unescapeHTML(escaped)
46720279 JMF	394
46720279 JMF	395	def _og_search_thumbnail(self, html, **kargs):
3c4e6d83	396	return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279 JMF	397
	398	def _og_search_description(self, html, **kargs):
	399	return self._og_search_property('description', html, fatal=False, **kargs)
	400
	401	def _og_search_title(self, html, **kargs):
	402	return self._og_search_property('title', html, **kargs)
	403
8ffa13e0	404	def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
ab2d5247 JMF	405	regexes = self._og_regexes('video')
ab2d5247 JMF	406	if secure: regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0	407	return self._html_search_regex(regexes, html, name, **kargs)
46720279	408
59040888 PH	409	def _html_search_meta(self, name, html, display_name=None):
	410	if display_name is None:
	411	display_name = name
	412	return self._html_search_regex(
aaebed13 PH	413	r'''(?ix)<meta
aaebed13 PH	414	(?=[^>]+(?:itemprop\|name\|property)=["\']%s["\'])
59040888 PH	415	[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
	416	html, display_name, fatal=False)
	417
	418	def _dc_search_uploader(self, html):
	419	return self._html_search_meta('dc.creator', html, 'uploader')
	420
8dbe9899 PH	421	def _rta_search(self, html):
	422	# See http://www.rtalabel.org/index.php?content=howtofaq#single
	423	if re.search(r'(?ix)<meta\s+name="rating"\s+'
	424	r' content="RTA-5042-1996-1400-1577-RTA"',
	425	html):
	426	return 18
	427	return 0
	428
59040888 PH	429	def _media_rating_search(self, html):
	430	# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
	431	rating = self._html_search_meta('rating', html)
	432
	433	if not rating:
	434	return None
	435
	436	RATING_TABLE = {
	437	'safe for kids': 0,
	438	'general': 8,
	439	'14 years': 14,
	440	'mature': 17,
	441	'restricted': 19,
	442	}
	443	return RATING_TABLE.get(rating.lower(), None)
	444
4bcc7bd1 PH	445	def _sort_formats(self, formats):
4bcc7bd1 PH	446	def _formats_key(f):
e6812ac9 PH	447	# TODO remove the following workaround
	448	from ..utils import determine_ext
	449	if not f.get('ext') and 'url' in f:
	450	f['ext'] = determine_ext(f['url'])
	451
4bcc7bd1 PH	452	preference = f.get('preference')
4bcc7bd1 PH	453	if preference is None:
c7deaa4c PH	454	proto = f.get('protocol')
	455	if proto is None:
	456	proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
	457
	458	preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1 PH	459	if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
	460	preference -= 0.5
	461
	462	if f.get('vcodec') == 'none': # audio only
	463	if self._downloader.params.get('prefer_free_formats'):
	464	ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
	465	else:
	466	ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
	467	ext_preference = 0
	468	try:
	469	audio_ext_preference = ORDER.index(f['ext'])
	470	except ValueError:
	471	audio_ext_preference = -1
	472	else:
	473	if self._downloader.params.get('prefer_free_formats'):
	474	ORDER = [u'flv', u'mp4', u'webm']
	475	else:
	476	ORDER = [u'webm', u'flv', u'mp4']
	477	try:
	478	ext_preference = ORDER.index(f['ext'])
	479	except ValueError:
	480	ext_preference = -1
	481	audio_ext_preference = 0
	482
	483	return (
	484	preference,
	485	f.get('height') if f.get('height') is not None else -1,
	486	f.get('width') if f.get('width') is not None else -1,
	487	ext_preference,
	488	f.get('vbr') if f.get('vbr') is not None else -1,
	489	f.get('abr') if f.get('abr') is not None else -1,
	490	audio_ext_preference,
	491	f.get('filesize') if f.get('filesize') is not None else -1,
	492	f.get('format_id'),
	493	)
	494	formats.sort(key=_formats_key)
59040888	495
8dbe9899	496
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source matching the search queries of this extractor."""
        # The prefix is either empty, a positive number or the word "all";
        # these must be real alternatives, not an escaped literal pipe.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query this extractor can handle."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """
        Parse the search query and fetch the requested number of results.

        An empty prefix asks for a single result, 'all' for _MAX_RESULTS and
        a number for that many results, capped at _MAX_RESULTS with a
        warning. Raises ExtractorError when the query does not match.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY