[yt-dlp.git] / youtube_dl / extractor / common.py

import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Download a page and return a tuple (page content as string, URL handle).

        Returns False instead when _request_webpage() reported a failure
        without raising.

        The character encoding is taken from the charset parameter of the
        Content-Type header, otherwise from a <meta ... charset=...>
        declaration within the first 1024 bytes of the body, otherwise it
        is UTF-8.  A declared encoding Python cannot use also falls back to
        UTF-8, so a bogus declaration does not abort the extraction.

        When the downloader params 'dump_intermediate_pages' or
        'write_pages' are set, the raw body is additionally printed
        base64-encoded and/or written to a .dump file in the current
        directory.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()

        # Capture only the charset token itself: stop at whitespace, at ';'
        # (further header parameters) or at a closing quote.
        m = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=["\']?([^\s;"\']+)',
            content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                # 'replace' keeps stray non-ASCII bytes from raising here;
                # the resulting unusable name is caught by the fallback
                # at the end of this method.
                encoding = m.group(1).decode('ascii', 'replace')
            else:
                encoding = 'utf-8'

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # Request objects expose get_full_url(); plain strings do not.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except (LookupError, ValueError):
            # LookupError: unknown encoding name.  ValueError: a name the
            # codec machinery rejects outright.  With the 'replace' handler
            # the payload itself does not raise decode errors.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info
    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.

        pattern may be a string, a compiled regex, or an iterable of those;
        iterables are tried in order and the first match wins.  An empty
        iterable counts as "no match".
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Start from "no match" so that an empty pattern list is handled
            # like any other failed search instead of leaving mobj unbound.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        if default is not _NO_DEFAULT:
            return default

        # Only failure messages need the field name; colour it blue when
        # stderr is a terminal on a non-Windows system.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
        
        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Queries have the form <_SEARCH_KEY><prefix>:<query>, where prefix is
    empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source that search queries must match."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of
        results through _get_n_results()."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Cap the request at what the service offers and say so.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY
Commit	Line	Data
d6983cb4 PH	1	import base64
	2	import os
	3	import re
	4	import socket
	5	import sys
fc79158d	6	import netrc
267ed0c5	7	import xml.etree.ElementTree
d6983cb4 PH	8
	9	from ..utils import (
	10	compat_http_client,
	11	compat_urllib_error,
c7deaa4c	12	compat_urllib_parse_urlparse,
d6983cb4 PH	13	compat_str,
	14
	15	clean_html,
	16	compiled_regex_type,
	17	ExtractorError,
55b3e45b	18	RegexNotFoundError,
d41e6efc	19	sanitize_filename,
f38de77f	20	unescapeHTML,
d6983cb4	21	)
46374a56	22	_NO_DEFAULT = object()
d6983cb4	23
dca08720	24
d6983cb4 PH	25	class InfoExtractor(object):
	26	"""Information Extractor class.
	27
	28	Information extractors are the classes that, given a URL, extract
	29	information about the video (or videos) the URL refers to. This
	30	information includes the real video URL, the video title, author and
	31	others. The information is stored in a dictionary which is then
	32	passed to the FileDownloader. The FileDownloader processes this
	33	information possibly downloading the video to the file system, among
	34	other possible outcomes.
	35
	36	The dictionaries must include the following fields:
	37
	38	id: Video identifier.
d6983cb4	39	title: Video title, unescaped.
d67b0b15	40
f49d89ee	41	Additionally, it must contain either a formats entry or a url one:
d67b0b15	42
f49d89ee PH	43	formats: A list of dictionaries for each format available, ordered
	44	from worst to best quality.
	45
	46	Potential fields:
d67b0b15 PH	47	* url Mandatory. The URL of the video file
	48	* ext Will be calculated from url if missing
	49	* format A human-readable description of the format
	50	("mp4 container with h264/opus").
	51	Calculated from the format_id, width, height.
	52	and format_note fields if missing.
	53	* format_id A short description of the format
5d4f3985 PH	54	("mp4_h264_opus" or "19").
5d4f3985 PH	55	Technically optional, but strongly recommended.
d67b0b15 PH	56	* format_note Additional info about the format
	57	("3D" or "DASH video")
	58	* width Width of the video, if known
	59	* height Height of the video, if known
f49d89ee	60	* resolution Textual description of width and height
7217e148	61	* tbr Average bitrate of audio and video in KBit/s
d67b0b15 PH	62	* abr Average audio bitrate in KBit/s
	63	* acodec Name of the audio codec in use
	64	* vbr Average video bitrate in KBit/s
	65	* vcodec Name of the video codec in use
	66	* filesize The number of bytes, if known in advance
	67	* player_url SWF Player URL (used for rtmpdump).
c7deaa4c PH	68	* protocol The protocol that will be used for the actual
	69	download, lower-case.
	70	"http", "https", "rtsp", "rtmp" or so.
f49d89ee	71	* preference Order number of this format. If this field is
08d13955 PH	72	present and not None, the formats get sorted
08d13955 PH	73	by this field.
f49d89ee PH	74	-1 for default (order by other properties),
f49d89ee PH	75	-2 or smaller for less than default.
5d73273f PH	76	* quality Order number of the video quality of this
	77	format, irrespective of the file format.
	78	-1 for default (order by other properties),
	79	-2 or smaller for less than default.
c0ba0f48	80	url: Final video URL.
d6983cb4	81	ext: Video filename extension.
d67b0b15 PH	82	format: The video format, defaults to ext (used for --get-format)
d67b0b15 PH	83	player_url: SWF Player URL (used for rtmpdump).
2f5865cc	84
d6983cb4 PH	85	The following fields are optional:
d6983cb4 PH	86
73e79f2a PH	87	thumbnails: A list of dictionaries (with the entries "resolution" and
73e79f2a PH	88	"url") for the varying thumbnails
d6983cb4 PH	89	thumbnail: Full URL to a video thumbnail image.
	90	description: One-line video description.
	91	uploader: Full name of the video uploader.
	92	upload_date: Video upload date (YYYYMMDD).
	93	uploader_id: Nickname or id of the video uploader.
	94	location: Physical location of the video.
5d51a883 JMF	95	subtitles: The subtitle file contents as a dictionary in the format
5d51a883 JMF	96	{language: subtitles}.
c0ba0f48	97	duration: Length of the video in seconds, as an integer.
f3d29461	98	view_count: How many users have watched the video on the platform.
19e3dfc9 PH	99	like_count: Number of positive ratings of the video
	100	dislike_count: Number of negative ratings of the video
	101	comment_count: Number of comments on the video
8dbe9899	102	age_limit: Age restriction for the video, as an integer (years)
9103bbc5 JMF	103	webpage_url: The url to the video webpage, if given to youtube-dl it
	104	should allow to get the same result again. (It will be set
	105	by YoutubeDL if it's missing)
d6983cb4	106
deefc05b	107	Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 PH	108
	109	Subclasses of this one should re-define the _real_initialize() and
	110	_real_extract() methods and define a _VALID_URL regexp.
	111	Probably, they should also be added to the list of extractors.
	112
	113	_real_extract() must return a list of information dictionaries as
	114	described above.
	115
	116	Finally, the _WORKING attribute should be set to False for broken IEs
	117	in order to warn the users and skip the tests.
	118	"""
	119
	120	_ready = False
	121	_downloader = None
	122	_WORKING = True
	123
	124	def __init__(self, downloader=None):
	125	"""Constructor. Receives an optional downloader."""
	126	self._ready = False
	127	self.set_downloader(downloader)
	128
	129	@classmethod
	130	def suitable(cls, url):
	131	"""Receives a URL and returns True if suitable for this IE."""
79cb2577 PH	132
	133	# This does not use has/getattr intentionally - we want to know whether
	134	# we have cached the regexp for this class, whereas getattr would also
	135	# match the superclass
	136	if '_VALID_URL_RE' not in cls.__dict__:
	137	cls._VALID_URL_RE = re.compile(cls._VALID_URL)
	138	return cls._VALID_URL_RE.match(url) is not None
d6983cb4 PH	139
	140	@classmethod
	141	def working(cls):
	142	"""Getter method for _WORKING."""
	143	return cls._WORKING
	144
	145	def initialize(self):
	146	"""Initializes an instance (authentication, etc)."""
	147	if not self._ready:
	148	self._real_initialize()
	149	self._ready = True
	150
	151	def extract(self, url):
	152	"""Extracts URL information and returns it in list of dicts."""
	153	self.initialize()
	154	return self._real_extract(url)
	155
	156	def set_downloader(self, downloader):
	157	"""Sets the downloader for this IE."""
	158	self._downloader = downloader
	159
	160	def _real_initialize(self):
	161	"""Real initialization process. Redefine in subclasses."""
	162	pass
	163
	164	def _real_extract(self, url):
	165	"""Real extraction process. Redefine in subclasses."""
	166	pass
	167
56c73665 JMF	168	@classmethod
	169	def ie_key(cls):
	170	"""A string for getting the InfoExtractor with get_info_extractor"""
	171	return cls.__name__[:-2]
	172
d6983cb4 PH	173	@property
	174	def IE_NAME(self):
	175	return type(self).__name__[:-2]
	176
7cc3570e	177	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 PH	178	""" Returns the response handle """
	179	if note is None:
	180	self.report_download_webpage(video_id)
	181	elif note is not False:
7cc3570e PH	182	if video_id is None:
	183	self.to_screen(u'%s' % (note,))
	184	else:
	185	self.to_screen(u'%s: %s' % (video_id, note))
d6983cb4	186	try:
dca08720	187	return self._downloader.urlopen(url_or_request)
d6983cb4	188	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3 PH	189	if errnote is False:
aa94a6d3 PH	190	return False
d6983cb4 PH	191	if errnote is None:
d6983cb4 PH	192	errnote = u'Unable to download webpage'
7cc3570e PH	193	errmsg = u'%s: %s' % (errnote, compat_str(err))
	194	if fatal:
	195	raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
	196	else:
	197	self._downloader.report_warning(errmsg)
	198	return False
d6983cb4	199
7cc3570e	200	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	201	""" Returns a tuple (page content as string, URL handle) """
b9d3e163 PH	202
	203	# Strip hashes from the URL (#1038)
	204	if isinstance(url_or_request, (compat_str, str)):
	205	url_or_request = url_or_request.partition('#')[0]
	206
7cc3570e PH	207	urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
	208	if urlh is False:
	209	assert not fatal
	210	return False
d6983cb4	211	content_type = urlh.headers.get('Content-Type', '')
f143d86a	212	webpage_bytes = urlh.read()
d6983cb4 PH	213	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	214	if m:
	215	encoding = m.group(1)
	216	else:
0d75ae2c	217	m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a PH	218	webpage_bytes[:1024])
	219	if m:
	220	encoding = m.group(1).decode('ascii')
	221	else:
	222	encoding = 'utf-8'
d6983cb4 PH	223	if self._downloader.params.get('dump_intermediate_pages', False):
	224	try:
	225	url = url_or_request.get_full_url()
	226	except AttributeError:
	227	url = url_or_request
	228	self.to_screen(u'Dumping request to ' + url)
	229	dump = base64.b64encode(webpage_bytes).decode('ascii')
	230	self._downloader.to_screen(dump)
d41e6efc PH	231	if self._downloader.params.get('write_pages', False):
	232	try:
	233	url = url_or_request.get_full_url()
	234	except AttributeError:
	235	url = url_or_request
	236	raw_filename = ('%s_%s.dump' % (video_id, url))
	237	filename = sanitize_filename(raw_filename, restricted=True)
	238	self.to_screen(u'Saving request to ' + filename)
	239	with open(filename, 'wb') as outf:
	240	outf.write(webpage_bytes)
	241
d6983cb4 PH	242	content = webpage_bytes.decode(encoding, 'replace')
	243	return (content, urlh)
	244
7cc3570e	245	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4	246	""" Returns the data of the page as a string """
7cc3570e PH	247	res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
	248	if res is False:
	249	return res
	250	else:
	251	content, _ = res
	252	return content
d6983cb4	253
2a275ab0	254	def _download_xml(self, url_or_request, video_id,
e2b38da9 PH	255	note=u'Downloading XML', errnote=u'Unable to download XML',
e2b38da9 PH	256	transform_source=None):
267ed0c5 JMF	257	"""Return the xml as an xml.etree.ElementTree.Element"""
267ed0c5 JMF	258	xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
e2b38da9 PH	259	if transform_source:
e2b38da9 PH	260	xml_string = transform_source(xml_string)
267ed0c5 JMF	261	return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
267ed0c5 JMF	262
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with '[ie_name]'.

    When video_id is given it is inserted, followed by ': ', between
    the tag and the message.
    """
    if video_id is None:
        id_prefix = u''
    else:
        id_prefix = u'%s: ' % video_id
    text = u'[%s] %s%s' % (self.IE_NAME, id_prefix, msg)
    self._downloader.report_warning(text)
	267
def to_screen(self, msg):
    """Show msg through the downloader, prefixed with '[ie_name]'."""
    tagged = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
	271
	272	def report_extraction(self, id_or_name):
	273	"""Report information extraction."""
	274	self.to_screen(u'%s: Extracting information' % id_or_name)
	275
	276	def report_download_webpage(self, video_id):
	277	"""Report webpage download."""
	278	self.to_screen(u'%s: Downloading webpage' % video_id)
	279
	280	def report_age_confirmation(self):
	281	"""Report attempt to confirm age."""
	282	self.to_screen(u'Confirming age')
	283
def report_login(self):
    """Announce on screen that a login is being attempted."""
    message = u'Logging in'
    self.to_screen(message)
	287
d6983cb4	288	#Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Build a result of type 'url' pointing at a page that still has to be processed.

    The dictionary holds the url and the given ie as 'ie_key'; an 'id'
    entry is added only when video_id is not None.
    """
    #TODO: ie should be the class used for getting the info
    result = dict(_type='url', url=url, ie_key=ie)
    if video_id is not None:
        result['id'] = video_id
    return result
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Build a result of type 'playlist' wrapping the given entries.

    'id' and 'title' are only included when playlist_id and
    playlist_title, respectively, are truthy.
    """
    result = dict(_type='playlist', entries=entries)
    optional_fields = (('id', playlist_id), ('title', playlist_title))
    for field, value in optional_fields:
        if value:
            result[field] = value
    return result
	309
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns, and return the first matching group.
    When nothing matches: return default if one was given; otherwise raise
    a RegexNotFoundError if fatal is true, else report a warning and return
    None.  The field name is included in the error/warning message.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from None so that an empty list of patterns counts as
        # "no match" instead of leaving mobj unbound.
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name with ANSI colour codes, but only on
    # non-Windows systems and when stderr is a terminal.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    else:
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None
d6983cb4 PH	340
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Same lookup as _search_regex; a non-empty result is additionally
    passed through clean_html and stripped of surrounding whitespace.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags)
    if not found:
        # None, an empty match or a falsy default: hand back untouched
        return found
    return clean_html(found).strip()
	350
fc79158d JMF	351	def _get_login_info(self):
	352	"""
	353	Get the the login info as (username, password)
	354	It will look in the netrc file using the _NETRC_MACHINE value
	355	If there's no info available, return (None, None)
	356	"""
	357	if self._downloader is None:
	358	return (None, None)
	359
	360	username = None
	361	password = None
	362	downloader_params = self._downloader.params
	363
	364	# Attempt to use provided username and password or .netrc data
	365	if downloader_params.get('username', None) is not None:
	366	username = downloader_params['username']
	367	password = downloader_params['password']
	368	elif downloader_params.get('usenetrc', False):
	369	try:
	370	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	371	if info is not None:
	372	username = info[0]
	373	password = info[2]
	374	else:
	375	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	376	except (IOError, netrc.NetrcParseError) as err:
	377	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	378
	379	return (username, password)
	380
46720279 JMF	381	# Helper functions for extracting OpenGraph info
46720279 JMF	382	@staticmethod
ab2d5247	383	def _og_regexes(prop):
78fb87b2	384	content_re = r'content=(?:"([^>]+?)"\|\'(.+?)\')'
9887c9b2	385	property_re = r'(?:name\|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2	386	template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247	387	return [
78fb87b2 JMF	388	template % (property_re, content_re),
78fb87b2 JMF	389	template % (content_re, property_re),
ab2d5247	390	]
46720279	391
3c4e6d83	392	def _og_search_property(self, prop, html, name=None, **kargs):
46720279	393	if name is None:
3c4e6d83	394	name = 'OpenGraph %s' % prop
ab2d5247	395	escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398 PH	396	if escaped is None:
	397	return None
	398	return unescapeHTML(escaped)
46720279 JMF	399
46720279 JMF	400	def _og_search_thumbnail(self, html, **kargs):
3c4e6d83	401	return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279 JMF	402
	403	def _og_search_description(self, html, **kargs):
	404	return self._og_search_property('description', html, fatal=False, **kargs)
	405
	406	def _og_search_title(self, html, **kargs):
	407	return self._og_search_property('title', html, **kargs)
	408
8ffa13e0	409	def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
ab2d5247 JMF	410	regexes = self._og_regexes('video')
ab2d5247 JMF	411	if secure: regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0	412	return self._html_search_regex(regexes, html, name, **kargs)
46720279	413
59040888 PH	414	def _html_search_meta(self, name, html, display_name=None):
	415	if display_name is None:
	416	display_name = name
	417	return self._html_search_regex(
aaebed13 PH	418	r'''(?ix)<meta
aaebed13 PH	419	(?=[^>]+(?:itemprop\|name\|property)=["\']%s["\'])
59040888 PH	420	[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
	421	html, display_name, fatal=False)
	422
	423	def _dc_search_uploader(self, html):
	424	return self._html_search_meta('dc.creator', html, 'uploader')
	425
8dbe9899 PH	426	def _rta_search(self, html):
	427	# See http://www.rtalabel.org/index.php?content=howtofaq#single
	428	if re.search(r'(?ix)<meta\s+name="rating"\s+'
	429	r' content="RTA-5042-1996-1400-1577-RTA"',
	430	html):
	431	return 18
	432	return 0
	433
59040888 PH	434	def _media_rating_search(self, html):
	435	# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
	436	rating = self._html_search_meta('rating', html)
	437
	438	if not rating:
	439	return None
	440
	441	RATING_TABLE = {
	442	'safe for kids': 0,
	443	'general': 8,
	444	'14 years': 14,
	445	'mature': 17,
	446	'restricted': 19,
	447	}
	448	return RATING_TABLE.get(rating.lower(), None)
	449
4bcc7bd1 PH	450	def _sort_formats(self, formats):
4bcc7bd1 PH	451	def _formats_key(f):
e6812ac9 PH	452	# TODO remove the following workaround
	453	from ..utils import determine_ext
	454	if not f.get('ext') and 'url' in f:
	455	f['ext'] = determine_ext(f['url'])
	456
4bcc7bd1 PH	457	preference = f.get('preference')
4bcc7bd1 PH	458	if preference is None:
c7deaa4c PH	459	proto = f.get('protocol')
	460	if proto is None:
	461	proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
	462
	463	preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1 PH	464	if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
	465	preference -= 0.5
	466
	467	if f.get('vcodec') == 'none': # audio only
	468	if self._downloader.params.get('prefer_free_formats'):
	469	ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
	470	else:
	471	ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
	472	ext_preference = 0
	473	try:
	474	audio_ext_preference = ORDER.index(f['ext'])
	475	except ValueError:
	476	audio_ext_preference = -1
	477	else:
	478	if self._downloader.params.get('prefer_free_formats'):
	479	ORDER = [u'flv', u'mp4', u'webm']
	480	else:
	481	ORDER = [u'webm', u'flv', u'mp4']
	482	try:
	483	ext_preference = ORDER.index(f['ext'])
	484	except ValueError:
	485	ext_preference = -1
	486	audio_ext_preference = 0
	487
	488	return (
	489	preference,
5d73273f	490	f.get('quality') if f.get('quality') is not None else -1,
4bcc7bd1 PH	491	f.get('height') if f.get('height') is not None else -1,
	492	f.get('width') if f.get('width') is not None else -1,
	493	ext_preference,
	494	f.get('vbr') if f.get('vbr') is not None else -1,
	495	f.get('abr') if f.get('abr') is not None else -1,
	496	audio_ext_preference,
	497	f.get('filesize') if f.get('filesize') is not None else -1,
	498	f.get('format_id'),
	499	)
	500	formats.sort(key=_formats_key)
59040888	501
8dbe9899	502
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search pseudo-URLs."""
        # The prefix is empty, a positive number or 'all'; the '|' must be
        # a bare alternation (an escaped '\|' would match a literal pipe).
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many results, capped at _MAX_RESULTS with a
        warning.  Raises ExtractorError for queries that do not match
        the expected format.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Cannot trigger with the default pattern; kept as a guard in
            # case a subclass overrides _make_valid_url
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY