[yt-dlp.git] / youtube_dl / extractor / common.py

import base64
import os
import re
import socket
import sys
import netrc

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_request,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    unescapeHTML,
)

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    Unless noted otherwise above (e.g. thumbnails is a list and subtitles
    a dictionary), the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        The instance starts out not ready; initialize() runs
        _real_initialize() on first use and then marks it ready.
        """
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The downloader is the object whose to_screen(), report_warning()
        and params are used by the other methods of this class.
        """
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note selects the progress line: None prints the standard
        "Downloading webpage" message, False prints nothing, and any other
        value is printed after the video id.
        URL, HTTP and socket errors are re-raised as ExtractorError whose
        message starts with errnote (a generic text when errnote is None).
        """
        if note is not False:
            if note is None:
                self.report_download_webpage(video_id)
            else:
                self.to_screen(u'%s: %s' % (video_id, note))

        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            prefix = u'Unable to download webpage' if errnote is None else errnote
            # Hand the original traceback over to ExtractorError.
            raise ExtractorError(u'%s: %s' % (prefix, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Download a page and return a tuple (page content as string, URL handle).

        url_or_request is either a URL string or a request object; note and
        errnote are forwarded to _request_webpage().
        The body is decoded with the charset announced in the Content-Type
        header, falling back to UTF-8 when none is given or when the
        announced charset is unknown to Python. Undecodable bytes are
        replaced rather than raising.
        When the downloader parameter 'dump_intermediate_pages' is set, the
        raw body is also printed base64-encoded.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Capture only the charset token itself: servers may quote the value
        # or append further parameters, neither of which is part of the name.
        m = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=["\']?([^\s;"\']+)',
            content_type)
        encoding = m.group(1) if m else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # Plain URL strings have no get_full_url()
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server announced a charset Python has no codec for;
            # a best-effort UTF-8 decode beats aborting the extraction.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.

        pattern may be a string, a compiled regex or an iterable of those;
        with an iterable the patterns are tried in order and the first match
        wins. An empty iterable counts as "no match".
        On failure: a default that is not None is returned as is; otherwise
        ExtractorError is raised when fatal is true, else a warning is
        reported through the downloader and None is returned.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Start from "no match" so that an empty pattern list reaches the
            # regular failure handling instead of leaving mobj unbound.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        if default is not None:
            return default

        # The field name only appears in failure messages, so the terminal
        # probing is done here rather than on every successful search.
        if sys.stderr.isatty() and os.name != 'nt':
            # ANSI blue, only for terminals on non-Windows systems
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
        
        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|N):{query}, where N is a
    positive integer without leading zeros. An empty prefix requests one
    result, "all" requests _MAX_RESULTS results and N requests N results
    (capped at _MAX_RESULTS, with a warning).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source for search queries of this extractor.

        It defines the named groups 'prefix' and 'query'.
        """
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if it is a search query for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query and delegate to _get_n_results().

        Raises ExtractorError when query does not have the expected format.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                # Cannot happen with the pattern from _make_valid_url();
                # kept as a safeguard in case a subclass loosens the pattern.
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the _SEARCH_KEY defined by the subclass."""
        return self._SEARCH_KEY
Commit	Line	Data
d6983cb4 PH	1	import base64
	2	import os
	3	import re
	4	import socket
	5	import sys
fc79158d	6	import netrc
d6983cb4 PH	7
	8	from ..utils import (
	9	compat_http_client,
	10	compat_urllib_error,
	11	compat_urllib_request,
	12	compat_str,
	13
	14	clean_html,
	15	compiled_regex_type,
	16	ExtractorError,
f38de77f	17	unescapeHTML,
d6983cb4 PH	18	)
	19
	20	class InfoExtractor(object):
	21	"""Information Extractor class.
	22
	23	Information extractors are the classes that, given a URL, extract
	24	information about the video (or videos) the URL refers to. This
	25	information includes the real video URL, the video title, author and
	26	others. The information is stored in a dictionary which is then
	27	passed to the FileDownloader. The FileDownloader processes this
	28	information possibly downloading the video to the file system, among
	29	other possible outcomes.
	30
	31	The dictionaries must include the following fields:
	32
	33	id: Video identifier.
	34	url: Final video URL.
	35	title: Video title, unescaped.
	36	ext: Video filename extension.
	37
	38	The following fields are optional:
	39
	40	format: The video format, defaults to ext (used for --get-format)
73e79f2a PH	41	thumbnails: A list of dictionaries (with the entries "resolution" and
73e79f2a PH	42	"url") for the varying thumbnails
d6983cb4 PH	43	thumbnail: Full URL to a video thumbnail image.
	44	description: One-line video description.
	45	uploader: Full name of the video uploader.
	46	upload_date: Video upload date (YYYYMMDD).
	47	uploader_id: Nickname or id of the video uploader.
	48	location: Physical location of the video.
	49	player_url: SWF Player URL (used for rtmpdump).
5d51a883 JMF	50	subtitles: The subtitle file contents as a dictionary in the format
5d51a883 JMF	51	{language: subtitles}.
f3d29461	52	view_count: How many users have watched the video on the platform.
d6983cb4 PH	53	urlhandle: [internal] The urlHandle to be used to download the file,
	54	like returned by urllib.request.urlopen
	55
	56	The fields should all be Unicode strings.
	57
	58	Subclasses of this one should re-define the _real_initialize() and
	59	_real_extract() methods and define a _VALID_URL regexp.
	60	Probably, they should also be added to the list of extractors.
	61
	62	_real_extract() must return a list of information dictionaries as
	63	described above.
	64
	65	Finally, the _WORKING attribute should be set to False for broken IEs
	66	in order to warn the users and skip the tests.
	67	"""
	68
	69	_ready = False
	70	_downloader = None
	71	_WORKING = True
	72
	73	def __init__(self, downloader=None):
	74	"""Constructor. Receives an optional downloader."""
	75	self._ready = False
	76	self.set_downloader(downloader)
	77
	78	@classmethod
	79	def suitable(cls, url):
	80	"""Receives a URL and returns True if suitable for this IE."""
	81	return re.match(cls._VALID_URL, url) is not None
	82
	83	@classmethod
	84	def working(cls):
	85	"""Getter method for _WORKING."""
	86	return cls._WORKING
	87
	88	def initialize(self):
	89	"""Initializes an instance (authentication, etc)."""
	90	if not self._ready:
	91	self._real_initialize()
	92	self._ready = True
	93
	94	def extract(self, url):
	95	"""Extracts URL information and returns it in list of dicts."""
	96	self.initialize()
	97	return self._real_extract(url)
	98
	99	def set_downloader(self, downloader):
	100	"""Sets the downloader for this IE."""
	101	self._downloader = downloader
	102
	103	def _real_initialize(self):
	104	"""Real initialization process. Redefine in subclasses."""
	105	pass
	106
	107	def _real_extract(self, url):
	108	"""Real extraction process. Redefine in subclasses."""
	109	pass
	110
	111	@property
	112	def IE_NAME(self):
	113	return type(self).__name__[:-2]
	114
	115	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
	116	""" Returns the response handle """
117	if note is None:
118	self.report_download_webpage(video_id)
119	elif note is not False:
120	self.to_screen(u'%s: %s' % (video_id, note))
121	try:
122	return compat_urllib_request.urlopen(url_or_request)
123	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124	if errnote is None:
125	errnote = u'Unable to download webpage'
126	raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127
128	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
129	""" Returns a tuple (page content as string, URL handle) """
b9d3e163 PH	130
	131	# Strip hashes from the URL (#1038)
	132	if isinstance(url_or_request, (compat_str, str)):
	133	url_or_request = url_or_request.partition('#')[0]
	134
d6983cb4 PH	135	urlh = self._request_webpage(url_or_request, video_id, note, errnote)
	136	content_type = urlh.headers.get('Content-Type', '')
	137	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	138	if m:
	139	encoding = m.group(1)
	140	else:
	141	encoding = 'utf-8'
	142	webpage_bytes = urlh.read()
	143	if self._downloader.params.get('dump_intermediate_pages', False):
	144	try:
	145	url = url_or_request.get_full_url()
	146	except AttributeError:
	147	url = url_or_request
	148	self.to_screen(u'Dumping request to ' + url)
	149	dump = base64.b64encode(webpage_bytes).decode('ascii')
	150	self._downloader.to_screen(dump)
	151	content = webpage_bytes.decode(encoding, 'replace')
	152	return (content, urlh)
	153
	154	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
	155	""" Returns the data of the page as a string """
	156	return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
	157
	158	def to_screen(self, msg):
	159	"""Print msg to screen, prefixing it with '[ie_name]'"""
	160	self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
	161
	162	def report_extraction(self, id_or_name):
	163	"""Report information extraction."""
	164	self.to_screen(u'%s: Extracting information' % id_or_name)
	165
	166	def report_download_webpage(self, video_id):
	167	"""Report webpage download."""
	168	self.to_screen(u'%s: Downloading webpage' % video_id)
	169
	170	def report_age_confirmation(self):
	171	"""Report attempt to confirm age."""
	172	self.to_screen(u'Confirming age')
	173
fc79158d JMF	174	def report_login(self):
	175	"""Report attempt to log in."""
	176	self.to_screen(u'Logging in')
	177
d6983cb4	178	#Methods for following #608
d6983cb4 PH	179	def url_result(self, url, ie=None):
	180	"""Returns a url that points to a page that should be processed"""
	181	#TODO: ie should be the class used for getting the info
	182	video_info = {'_type': 'url',
	183	'url': url,
	184	'ie_key': ie}
	185	return video_info
	186	def playlist_result(self, entries, playlist_id=None, playlist_title=None):
	187	"""Returns a playlist"""
	188	video_info = {'_type': 'playlist',
	189	'entries': entries}
	190	if playlist_id:
	191	video_info['id'] = playlist_id
	192	if playlist_title:
	193	video_info['title'] = playlist_title
	194	return video_info
	195
	196	def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	197	"""
	198	Perform a regex search on the given string, using a single or a list of
	199	patterns returning the first matching group.
	200	In case of failure return a default value or raise a WARNING or a
	201	ExtractorError, depending on fatal, specifying the field name.
	202	"""
	203	if isinstance(pattern, (str, compat_str, compiled_regex_type)):
	204	mobj = re.search(pattern, string, flags)
	205	else:
	206	for p in pattern:
	207	mobj = re.search(p, string, flags)
	208	if mobj: break
	209
	210	if sys.stderr.isatty() and os.name != 'nt':
	211	_name = u'\033[0;34m%s\033[0m' % name
	212	else:
	213	_name = name
	214
	215	if mobj:
	216	# return the first matching group
	217	return next(g for g in mobj.groups() if g is not None)
	218	elif default is not None:
	219	return default
	220	elif fatal:
	221	raise ExtractorError(u'Unable to extract %s' % _name)
	222	else:
	223	self._downloader.report_warning(u'unable to extract %s; '
98bcd283	224	u'please report this issue on http://yt-dl.org/bug' % _name)
d6983cb4 PH	225	return None
	226
	227	def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	228	"""
	229	Like _search_regex, but strips HTML tags and unescapes entities.
	230	"""
	231	res = self._search_regex(pattern, string, name, default, fatal, flags)
	232	if res:
	233	return clean_html(res).strip()
	234	else:
	235	return res
	236
fc79158d JMF	237	def _get_login_info(self):
	238	"""
	239	Get the the login info as (username, password)
	240	It will look in the netrc file using the _NETRC_MACHINE value
	241	If there's no info available, return (None, None)
	242	"""
	243	if self._downloader is None:
	244	return (None, None)
	245
	246	username = None
	247	password = None
	248	downloader_params = self._downloader.params
	249
	250	# Attempt to use provided username and password or .netrc data
	251	if downloader_params.get('username', None) is not None:
	252	username = downloader_params['username']
	253	password = downloader_params['password']
	254	elif downloader_params.get('usenetrc', False):
	255	try:
	256	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	257	if info is not None:
	258	username = info[0]
	259	password = info[2]
	260	else:
	261	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	262	except (IOError, netrc.NetrcParseError) as err:
	263	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	264
	265	return (username, password)
	266
46720279 JMF	267	# Helper functions for extracting OpenGraph info
46720279 JMF	268	@staticmethod
3c4e6d83 PH	269	def _og_regex(prop):
3c4e6d83 PH	270	return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"\|\'(.+?)\')' % re.escape(prop)
46720279	271
3c4e6d83	272	def _og_search_property(self, prop, html, name=None, **kargs):
46720279	273	if name is None:
3c4e6d83	274	name = 'OpenGraph %s' % prop
f38de77f PH	275	escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
f38de77f PH	276	return unescapeHTML(escaped)
46720279 JMF	277
46720279 JMF	278	def _og_search_thumbnail(self, html, **kargs):
3c4e6d83	279	return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279 JMF	280
	281	def _og_search_description(self, html, **kargs):
	282	return self._og_search_property('description', html, fatal=False, **kargs)
	283
	284	def _og_search_title(self, html, **kargs):
	285	return self._og_search_property('title', html, **kargs)
	286
	287	def _og_search_video_url(self, html, name='video url', **kargs):
	288	return self._html_search_regex([self._og_regex('video:secure_url'),
	289	self._og_regex('video')],
	290	html, name, **kargs)
	291
d6983cb4 PH	292	class SearchInfoExtractor(InfoExtractor):
	293	"""
	294	Base class for paged search queries extractors.
	295	They accept urls in the format _SEARCH_KEY(\|all\|[0-9]):{query}
	296	Instances should define _SEARCH_KEY and _MAX_RESULTS.
	297	"""
	298
	299	@classmethod
	300	def _make_valid_url(cls):
	301	return r'%s(?P<prefix>\|[1-9][0-9]*\|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
	302
	303	@classmethod
	304	def suitable(cls, url):
	305	return re.match(cls._make_valid_url(), url) is not None
	306
	307	def _real_extract(self, query):
	308	mobj = re.match(self._make_valid_url(), query)
	309	if mobj is None:
	310	raise ExtractorError(u'Invalid search query "%s"' % query)
	311
	312	prefix = mobj.group('prefix')
	313	query = mobj.group('query')
	314	if prefix == '':
	315	return self._get_n_results(query, 1)
	316	elif prefix == 'all':
	317	return self._get_n_results(query, self._MAX_RESULTS)
	318	else:
	319	n = int(prefix)
	320	if n <= 0:
	321	raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
	322	elif n > self._MAX_RESULTS:
	323	self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
	324	n = self._MAX_RESULTS
	325	return self._get_n_results(query, n)
	326
	327	def _get_n_results(self, query, n):
	328	"""Get a specified number of results for a query"""
	329	raise NotImplementedError("This method must be implemented by sublclasses")
0f818663 PH	330
	331	@property
	332	def SEARCH_KEY(self):
	333	return self._SEARCH_KEY