# [yt-dlp.git] / youtube_dl / extractor / common.py

import base64
import os
import re
import socket
import sys
import netrc

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_request,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    unescapeHTML,
)

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note selects the progress message: None prints the standard
        "Downloading webpage" line, False prints nothing, anything else is
        printed after the video id.

        Raises ExtractorError when opening fails with a URL, HTTP or socket
        error; the message starts with errnote, or with a generic text when
        errnote is None.
        """
        if note is not False:
            if note is None:
                self.report_download_webpage(video_id)
            else:
                self.to_screen(u'%s: %s' % (video_id, note))

        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            prefix = u'Unable to download webpage' if errnote is None else errnote
            # Hand the original traceback over to the ExtractorError.
            raise ExtractorError(u'%s: %s' % (prefix, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj: break

        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
        
        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source for this extractor's search queries.

        It is _SEARCH_KEY, a 'prefix' group (empty, a positive number
        without leading zero, or 'all'), a colon and a non-empty 'query'
        group.
        """
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True when url is a search query this extractor accepts."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse a search query and return the requested number of results.

        An empty prefix asks for 1 result, 'all' for _MAX_RESULTS, and a
        number for that many results, capped at _MAX_RESULTS with a warning.
        Raises ExtractorError for a query that does not match the expected
        format or for a non-positive number.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        query = match.group('query')

        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Asked for more than the site can deliver: warn and cap
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Not implemented in this base class: it always raises
        NotImplementedError, so each search extractor has to provide its
        own version.
        """
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the extractor's _SEARCH_KEY."""
        key = self._SEARCH_KEY
        return key
Commit	Line	Data
d6983cb4 PH	1	import base64
	2	import os
	3	import re
	4	import socket
	5	import sys
fc79158d	6	import netrc
d6983cb4 PH	7
	8	from ..utils import (
	9	compat_http_client,
	10	compat_urllib_error,
	11	compat_urllib_request,
	12	compat_str,
	13
	14	clean_html,
	15	compiled_regex_type,
	16	ExtractorError,
f38de77f	17	unescapeHTML,
d6983cb4 PH	18	)
	19
	20	class InfoExtractor(object):
	21	"""Information Extractor class.
	22
	23	Information extractors are the classes that, given a URL, extract
	24	information about the video (or videos) the URL refers to. This
	25	information includes the real video URL, the video title, author and
	26	others. The information is stored in a dictionary which is then
	27	passed to the FileDownloader. The FileDownloader processes this
	28	information possibly downloading the video to the file system, among
	29	other possible outcomes.
	30
	31	The dictionaries must include the following fields:
	32
	33	id: Video identifier.
	34	url: Final video URL.
	35	title: Video title, unescaped.
	36	ext: Video filename extension.
	37
	38	The following fields are optional:
	39
	40	format: The video format, defaults to ext (used for --get-format)
73e79f2a PH	41	thumbnails: A list of dictionaries (with the entries "resolution" and
73e79f2a PH	42	"url") for the varying thumbnails
d6983cb4 PH	43	thumbnail: Full URL to a video thumbnail image.
	44	description: One-line video description.
	45	uploader: Full name of the video uploader.
	46	upload_date: Video upload date (YYYYMMDD).
	47	uploader_id: Nickname or id of the video uploader.
	48	location: Physical location of the video.
	49	player_url: SWF Player URL (used for rtmpdump).
	50	subtitles: The subtitle file contents.
f3d29461	51	view_count: How many users have watched the video on the platform.
d6983cb4 PH	52	urlhandle: [internal] The urlHandle to be used to download the file,
	53	like returned by urllib.request.urlopen
	54
	55	The fields should all be Unicode strings.
	56
	57	Subclasses of this one should re-define the _real_initialize() and
	58	_real_extract() methods and define a _VALID_URL regexp.
	59	Probably, they should also be added to the list of extractors.
	60
	61	_real_extract() must return a list of information dictionaries as
	62	described above.
	63
	64	Finally, the _WORKING attribute should be set to False for broken IEs
	65	in order to warn the users and skip the tests.
	66	"""
	67
	68	_ready = False
	69	_downloader = None
	70	_WORKING = True
	71
	72	def __init__(self, downloader=None):
	73	"""Constructor. Receives an optional downloader."""
	74	self._ready = False
	75	self.set_downloader(downloader)
	76
	77	@classmethod
	78	def suitable(cls, url):
	79	"""Receives a URL and returns True if suitable for this IE."""
	80	return re.match(cls._VALID_URL, url) is not None
	81
	82	@classmethod
	83	def working(cls):
	84	"""Getter method for _WORKING."""
	85	return cls._WORKING
	86
	87	def initialize(self):
	88	"""Initializes an instance (authentication, etc)."""
	89	if not self._ready:
	90	self._real_initialize()
	91	self._ready = True
	92
	93	def extract(self, url):
	94	"""Extracts URL information and returns it in list of dicts."""
	95	self.initialize()
	96	return self._real_extract(url)
	97
	98	def set_downloader(self, downloader):
	99	"""Sets the downloader for this IE."""
	100	self._downloader = downloader
	101
	102	def _real_initialize(self):
	103	"""Real initialization process. Redefine in subclasses."""
	104	pass
	105
	106	def _real_extract(self, url):
	107	"""Real extraction process. Redefine in subclasses."""
	108	pass
	109
	110	@property
	111	def IE_NAME(self):
	112	return type(self).__name__[:-2]
	113
	114	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
	115	""" Returns the response handle """
116	if note is None:
117	self.report_download_webpage(video_id)
118	elif note is not False:
119	self.to_screen(u'%s: %s' % (video_id, note))
120	try:
121	return compat_urllib_request.urlopen(url_or_request)
122	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
123	if errnote is None:
124	errnote = u'Unable to download webpage'
125	raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
126
127	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128	""" Returns a tuple (page content as string, URL handle) """
b9d3e163 PH	129
	130	# Strip hashes from the URL (#1038)
	131	if isinstance(url_or_request, (compat_str, str)):
	132	url_or_request = url_or_request.partition('#')[0]
	133
d6983cb4 PH	134	urlh = self._request_webpage(url_or_request, video_id, note, errnote)
	135	content_type = urlh.headers.get('Content-Type', '')
	136	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	137	if m:
	138	encoding = m.group(1)
	139	else:
	140	encoding = 'utf-8'
	141	webpage_bytes = urlh.read()
	142	if self._downloader.params.get('dump_intermediate_pages', False):
	143	try:
	144	url = url_or_request.get_full_url()
	145	except AttributeError:
	146	url = url_or_request
	147	self.to_screen(u'Dumping request to ' + url)
	148	dump = base64.b64encode(webpage_bytes).decode('ascii')
	149	self._downloader.to_screen(dump)
	150	content = webpage_bytes.decode(encoding, 'replace')
	151	return (content, urlh)
	152
	153	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
	154	""" Returns the data of the page as a string """
	155	return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
	156
	157	def to_screen(self, msg):
	158	"""Print msg to screen, prefixing it with '[ie_name]'"""
	159	self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
	160
	161	def report_extraction(self, id_or_name):
	162	"""Report information extraction."""
	163	self.to_screen(u'%s: Extracting information' % id_or_name)
	164
	165	def report_download_webpage(self, video_id):
	166	"""Report webpage download."""
	167	self.to_screen(u'%s: Downloading webpage' % video_id)
	168
	169	def report_age_confirmation(self):
	170	"""Report attempt to confirm age."""
	171	self.to_screen(u'Confirming age')
	172
fc79158d JMF	173	def report_login(self):
	174	"""Report attempt to log in."""
	175	self.to_screen(u'Logging in')
	176
d6983cb4	177	#Methods for following #608
d6983cb4 PH	178	def url_result(self, url, ie=None):
	179	"""Returns a url that points to a page that should be processed"""
	180	#TODO: ie should be the class used for getting the info
	181	video_info = {'_type': 'url',
	182	'url': url,
	183	'ie_key': ie}
	184	return video_info
	185	def playlist_result(self, entries, playlist_id=None, playlist_title=None):
	186	"""Returns a playlist"""
	187	video_info = {'_type': 'playlist',
	188	'entries': entries}
	189	if playlist_id:
	190	video_info['id'] = playlist_id
	191	if playlist_title:
	192	video_info['title'] = playlist_title
	193	return video_info
	194
	195	def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	196	"""
	197	Perform a regex search on the given string, using a single or a list of
	198	patterns returning the first matching group.
	199	In case of failure return a default value or raise a WARNING or a
	200	ExtractorError, depending on fatal, specifying the field name.
	201	"""
	202	if isinstance(pattern, (str, compat_str, compiled_regex_type)):
	203	mobj = re.search(pattern, string, flags)
	204	else:
	205	for p in pattern:
	206	mobj = re.search(p, string, flags)
	207	if mobj: break
	208
	209	if sys.stderr.isatty() and os.name != 'nt':
	210	_name = u'\033[0;34m%s\033[0m' % name
	211	else:
	212	_name = name
	213
	214	if mobj:
	215	# return the first matching group
	216	return next(g for g in mobj.groups() if g is not None)
	217	elif default is not None:
	218	return default
	219	elif fatal:
	220	raise ExtractorError(u'Unable to extract %s' % _name)
	221	else:
	222	self._downloader.report_warning(u'unable to extract %s; '
98bcd283	223	u'please report this issue on http://yt-dl.org/bug' % _name)
d6983cb4 PH	224	return None
	225
	226	def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
	227	"""
	228	Like _search_regex, but strips HTML tags and unescapes entities.
	229	"""
	230	res = self._search_regex(pattern, string, name, default, fatal, flags)
	231	if res:
	232	return clean_html(res).strip()
	233	else:
	234	return res
	235
fc79158d JMF	236	def _get_login_info(self):
	237	"""
	238	Get the the login info as (username, password)
	239	It will look in the netrc file using the _NETRC_MACHINE value
	240	If there's no info available, return (None, None)
	241	"""
	242	if self._downloader is None:
	243	return (None, None)
	244
	245	username = None
	246	password = None
	247	downloader_params = self._downloader.params
	248
	249	# Attempt to use provided username and password or .netrc data
	250	if downloader_params.get('username', None) is not None:
	251	username = downloader_params['username']
	252	password = downloader_params['password']
	253	elif downloader_params.get('usenetrc', False):
	254	try:
	255	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	256	if info is not None:
	257	username = info[0]
	258	password = info[2]
	259	else:
	260	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	261	except (IOError, netrc.NetrcParseError) as err:
	262	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	263
	264	return (username, password)
	265
46720279 JMF	266	# Helper functions for extracting OpenGraph info
46720279 JMF	267	@staticmethod
3c4e6d83 PH	268	def _og_regex(prop):
3c4e6d83 PH	269	return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"\|\'(.+?)\')' % re.escape(prop)
46720279	270
3c4e6d83	271	def _og_search_property(self, prop, html, name=None, **kargs):
46720279	272	if name is None:
3c4e6d83	273	name = 'OpenGraph %s' % prop
f38de77f PH	274	escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
f38de77f PH	275	return unescapeHTML(escaped)
46720279 JMF	276
46720279 JMF	277	def _og_search_thumbnail(self, html, **kargs):
3c4e6d83	278	return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279 JMF	279
	280	def _og_search_description(self, html, **kargs):
	281	return self._og_search_property('description', html, fatal=False, **kargs)
	282
	283	def _og_search_title(self, html, **kargs):
	284	return self._og_search_property('title', html, **kargs)
	285
	286	def _og_search_video_url(self, html, name='video url', **kargs):
	287	return self._html_search_regex([self._og_regex('video:secure_url'),
	288	self._og_regex('video')],
	289	html, name, **kargs)
	290
d6983cb4 PH	291	class SearchInfoExtractor(InfoExtractor):
	292	"""
	293	Base class for paged search queries extractors.
	294	They accept urls in the format _SEARCH_KEY(\|all\|[0-9]):{query}
	295	Instances should define _SEARCH_KEY and _MAX_RESULTS.
	296	"""
	297
	298	@classmethod
	299	def _make_valid_url(cls):
	300	return r'%s(?P<prefix>\|[1-9][0-9]*\|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
	301
	302	@classmethod
	303	def suitable(cls, url):
	304	return re.match(cls._make_valid_url(), url) is not None
	305
	306	def _real_extract(self, query):
	307	mobj = re.match(self._make_valid_url(), query)
	308	if mobj is None:
	309	raise ExtractorError(u'Invalid search query "%s"' % query)
	310
	311	prefix = mobj.group('prefix')
	312	query = mobj.group('query')
	313	if prefix == '':
	314	return self._get_n_results(query, 1)
	315	elif prefix == 'all':
	316	return self._get_n_results(query, self._MAX_RESULTS)
	317	else:
	318	n = int(prefix)
	319	if n <= 0:
	320	raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
	321	elif n > self._MAX_RESULTS:
	322	self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
	323	n = self._MAX_RESULTS
	324	return self._get_n_results(query, n)
	325
	326	def _get_n_results(self, query, n):
	327	"""Get a specified number of results for a query"""
	328	raise NotImplementedError("This method must be implemented by sublclasses")
0f818663 PH	329
	330	@property
	331	def SEARCH_KEY(self):
	332	return self._SEARCH_KEY