]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
modified FFmpegExtractAudioPP to accept whether it should overwrite post-processed...
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
f4bfd65f 6import json
d77c3dfd
FV
7import locale
8import os
9import re
10import sys
11import zlib
d77c3dfd 12import email.utils
921a1455 13import json
d77c3dfd 14
01ba00ca 15try:
59ae15a5 16 import urllib.request as compat_urllib_request
01ba00ca 17except ImportError: # Python 2
59ae15a5 18 import urllib2 as compat_urllib_request
01ba00ca
PH
19
20try:
59ae15a5 21 import urllib.error as compat_urllib_error
01ba00ca 22except ImportError: # Python 2
59ae15a5 23 import urllib2 as compat_urllib_error
01ba00ca
PH
24
25try:
59ae15a5 26 import urllib.parse as compat_urllib_parse
01ba00ca 27except ImportError: # Python 2
59ae15a5 28 import urllib as compat_urllib_parse
01ba00ca 29
799c0763
PH
30try:
31 from urllib.parse import urlparse as compat_urllib_parse_urlparse
32except ImportError: # Python 2
33 from urlparse import urlparse as compat_urllib_parse_urlparse
34
01ba00ca 35try:
59ae15a5 36 import http.cookiejar as compat_cookiejar
01ba00ca 37except ImportError: # Python 2
59ae15a5 38 import cookielib as compat_cookiejar
01ba00ca 39
3e669f36 40try:
59ae15a5 41 import html.entities as compat_html_entities
9f37a959 42except ImportError: # Python 2
59ae15a5 43 import htmlentitydefs as compat_html_entities
3e669f36 44
a8156c1d 45try:
59ae15a5 46 import html.parser as compat_html_parser
9f37a959 47except ImportError: # Python 2
59ae15a5 48 import HTMLParser as compat_html_parser
a8156c1d 49
348d0a7a 50try:
59ae15a5 51 import http.client as compat_http_client
9f37a959 52except ImportError: # Python 2
59ae15a5 53 import httplib as compat_http_client
348d0a7a 54
5910e210
PH
# Polyfill for subprocess.DEVNULL (added in Python 3.3): return a writable
# handle that discards everything. The fallback opens os.devnull on demand.
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        return open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode *string*: each run of %XX escapes is gathered into a
        # byte sequence and decoded with *encoding* in one go, so multi-byte
        # UTF-8 sequences split across several escapes decode correctly.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Py2-only: str.decode('hex') turns the two hex digits into a byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        # Split a query string into an ordered list of (name, value) pairs.
        # Both '&' and ';' act as pair separators, as in cpython's parse_qsl.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        # Parse a query string into a dict mapping each name to the list of
        # values it appeared with (same shape as urllib.parse.parse_qs).
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 139
# On Python 2 text is the ``unicode`` type; on Python 3 that name is gone
# (NameError) and ``str`` is already text.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Same dance for the code-point -> one-character-string function.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 149
# Default HTTP headers added to every request by YoutubeDLHandler:
# a desktop Firefox User-Agent plus permissive Accept-* values.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec exists and can actually encode text;
        # some platforms report bogus values.
        u'TEST'.encode(pref)
    except Exception:
        # A bare ``except:`` here would also swallow SystemExit and
        # KeyboardInterrupt; only real lookup/encode failures should
        # trigger the fallback.
        pref = 'UTF-8'

    return pref
d77c3dfd 170
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print a unicode string (native text on Python 3)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print a unicode string, encoding it for Python 2 terminals."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 178
f4bfd65f
PH
# json.dump wants a byte stream on Python 2 but a character stream on
# Python 3, so pick the matching open() mode once, at import time.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as fobj:
            json.dump(obj, fobj)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (byte stream)."""
        with open(fn, 'wb') as fobj:
            json.dump(obj, fobj)
190
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: hexadecimal (&#xHH;) or decimal (&#NN;).
    # The previous pattern ``#(x?\d+)`` only allowed decimal digits after
    # the optional 'x', silently truncating hex references containing a-f
    # (e.g. '&#x2F;' was parsed as '&#x2;').
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 215
# Backport of a newer HTMLParser tokenizer regex so attribute values with
# unusual characters are recognized correctly on older Python versions.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix

class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name and value identifying the element to isolate.
        self.attribute = attribute
        self.value = value
        # self.result grows to [tag, startpos, endpos] as parsing proceeds.
        self.result = None
        self.started = False         # True while inside the matched element
        self.depth = {}              # per-tag nesting counters (for matching the close tag)
        self.html = None             # full document text, set by loads()
        self.watch_startpos = False  # next event records the content start position
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Best-effort recovery from malformed HTML: skip one line and resume
        # via goahead(); give up after 10 errors or once the target started.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        # Feed the whole document to the parser in one call.
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Depth of the opening tag back to zero: matched close tag found.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event right after the matched start tag pins down where its
    # content begins, so alias them all to find_startpos.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Return the text between the matched start tag and its closing tag,
        # or None when no complete [tag, start, end] triple was recorded.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single-line element: end offset is relative to the trimmed start.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
9e6dd238
FV
282
def get_element_by_id(id, html):
    """Convenience wrapper: return the content of the element whose "id"
    attribute equals *id* in the HTML document *html*."""
    return get_element_by_attribute("id", id, html)
286
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag whose *attribute* equals *value*
    in the HTML document *html*, or None when nothing matches."""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Markup too broken for AttrParser's recovery: fall through and
        # report whatever (possibly partial) result was collected.
        pass
    return attr_parser.get_result()
9e6dd238
FV
295
296
def clean_html(html):
    """Turn an HTML snippet into readable plain text."""
    # Collapse source newlines, then re-introduce them for <br> tags and
    # paragraph boundaries (</p><p>).
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode character entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text)
9e6dd238
FV
308
309
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Downloaded data is bytes; on Python 3 sys.stdout is a text
            # stream, so hand out its underlying binary buffer when present.
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
d77c3dfd
FV
335
336
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 344
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _map_char(ch):
        # Translate one character to its filesystem-safe replacement.
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'()[]{}$;`^,#' or ch.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return ch

    result = u''.join(_map_char(ch) for ch in s)
    if is_id:
        # IDs are kept verbatim apart from the per-character mapping.
        return result
    # Collapse runs of underscores and trim them from the ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never return an empty filename.
    return result or '_'
d77c3dfd
FV
376
def orderedSet(iterable):
    """Return a list of *iterable*'s elements with duplicates removed,
    preserving first-seen order.

    Membership is tested against the output list itself, so elements need
    not be hashable (at the cost of O(n^2) behaviour).
    """
    unique = []
    for element in iterable:
        if element in unique:
            continue
        unique.append(element)
    return unique
d77c3dfd
FV
384
def unescapeHTML(s):
    """Replace every HTML entity in the unicode string *s* with the
    character it denotes.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
393
def encodeFilename(s):
    """Encode a unicode filename for the platform's filesystem API.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s
    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s
    return s.encode(sys.getfilesystemencoding(), 'ignore')
d77c3dfd
FV
412
class DownloadError(Exception):
    """Raised by FileDownloader objects when downloading fails and the
    downloader is not configured to continue on errors; carries the
    appropriate error message.
    """
    pass
d77c3dfd
FV
421
422
class SameFileError(Exception):
    """Raised by FileDownloader objects when they detect that several
    downloads would end up writing to the same file on disk.
    """
    pass
d77c3dfd
FV
430
431
class PostProcessingError(Exception):
    """Raised from a PostProcessor's .run() method to signal that the
    postprocessing task failed.
    """
    pass
d77c3dfd
FV
439
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
d77c3dfd
FV
443
444
class UnavailableVideoError(Exception):
    """Raised when a video is requested in a format that is not available
    for that video.
    """
    pass
d77c3dfd
FV
452
453
class ContentTooShortError(Exception):
    """Raised by FileDownloader objects when the data they received is
    smaller than what the server announced, which usually means the
    connection was interrupted.
    """

    # Both counters are in bytes.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 468
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first (no zlib header, as some servers send),
        # then fall back to a standard zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer addinfourl (the one that grew getcode()) takes the status
        # code as a constructor argument; older versions need it set by hand.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force std_headers over any caller-supplied duplicates: delete the
        # existing header (if any) before adding ours.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Honour the internal opt-out header by dropping Accept-encoding,
        # then strip the marker itself before the request hits the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        # Transparently decompress the body, re-wrapping the stream so the
        # caller still sees a response object with the original metadata.
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # urllib dispatches handler methods by scheme; reuse the same hooks
    # for HTTPS traffic.
    https_request = http_request
    https_response = http_response