[yt-dlp.git] / youtube_dl / utils.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import gzip
import io
import json
import locale
import os
import re
import sys
import zlib
import email.utils
import json

try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError: # Python 2
    from urlparse import urlparse as compat_urllib_parse_urlparse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client

# compat_subprocess_get_DEVNULL() returns an object to use as a "discard
# output" target for subprocess calls. subprocess.DEVNULL is returned when the
# running Python provides it; otherwise os.devnull is opened for writing.
# NOTE(review): the fallback opens a new file object on every call and nothing
# in this module closes it -- presumably the caller owns the handle; confirm.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')

try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in `string` with the characters they encode.

        Runs of consecutive escapes are accumulated as raw bytes and decoded
        together with `encoding`/`errors`, so a multi-byte sequence is decoded
        as one unit. `None` for `encoding` or `errors` selects the defaults
        ('utf-8' and 'replace').

        Returns `string` unchanged when it is empty or contains no '%'.

        This definition only exists on the Python 2 branch of the surrounding
        try/except; it relies on str.decode('hex').
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        # Everything before the first '%' is literal text.
        string = res[0]
        for item in res[1:]:
            try:
                # An empty item means a '%' directly followed by another '%'
                # (or the end of the string): treat it as a literal '%'.
                if not item:
                    raise ValueError
                # NOTE(review): on Python 2 the 'hex' codec may raise
                # TypeError (not ValueError) for non-hex input, which the
                # handler below would not catch -- confirm.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text, including its '%'.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) text pairs.

        Fields are separated by '&' or ';'. In each field '+' becomes a
        space and %xx escapes are resolved through _unquote().

        keep_blank_values -- when true, fields with an empty or missing
            value are kept with '' as their value; otherwise they are dropped.
        strict_parsing -- when true, a field without '=' raises ValueError
            instead of being skipped.
        """
        to_text = unicode

        def decode_component(raw):
            # '+' stands for a space; percent escapes are resolved afterwards.
            spaced = raw.replace('+', ' ')
            return to_text(_unquote(spaced, encoding=encoding, errors=errors))

        decoded_pairs = []
        for ampersand_part in qs.split('&'):
            for field in ampersand_part.split(';'):
                if not (field or strict_parsing):
                    continue
                parts = field.split('=', 1)
                if len(parts) != 2:
                    if strict_parsing:
                        raise ValueError("bad query field: %r" % (field,))
                    # A control name without an equal sign
                    if not keep_blank_values:
                        continue
                    parts.append('')
                if keep_blank_values or len(parts[1]):
                    key = decode_component(parts[0])
                    val = decode_component(parts[1])
                    decoded_pairs.append((key, val))
        return decoded_pairs

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        The arguments are forwarded to _parse_qsl(); values that share a name
        are collected in the order in which they appear.
        """
        grouped = {}
        fields = _parse_qsl(qs, keep_blank_values, strict_parsing,
                            encoding=encoding, errors=errors)
        for key, val in fields:
            grouped.setdefault(key, []).append(val)
        return grouped

# compat_str: the text type of the running interpreter -- `unicode` where
# that builtin exists (Python 2), otherwise `str`.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: turns a code point into a one-character text string -- `unichr`
# where that builtin exists (Python 2), otherwise `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Default request headers. YoutubeDLHandler.http_request() sets each of these
# on every request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually be used to encode
    text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and is usable.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (unknown codec, locale errors, ...), but no
        # longer a bare except: KeyboardInterrupt and SystemExit propagate.
        pref = 'UTF-8'

    return pref

if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string `s`; anything that is not text is rejected."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print `s` encoded for the console.

        Characters the preferred encoding cannot represent are written as
        XML character references.
        """
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)

# json.dump needs a different kind of stream depending on the interpreter:
# a character stream on Python 3.x, a byte stream on Python 2.x.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize `obj` as JSON into the file named `fn` (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as out:
            json.dump(obj, out)
else:
    def write_json_file(obj, fn):
        """Serialize `obj` as JSON into the file named `fn` (binary mode)."""
        with open(fn, 'wb') as out:
            json.dump(obj, out)


def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading '&' and the trailing ';'.

    Named entities are looked up in compat_html_entities.name2codepoint.
    Numeric references are accepted in decimal ('#233') and hexadecimal
    ('#xe9', '#XE9') form. Anything else -- including a numeric reference
    whose code point cannot be turned into a character -- is returned in
    its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    name2codepoint = compat_html_entities.name2codepoint
    if entity in name2codepoint:
        return compat_chr(name2codepoint[entity])

    # Numeric character reference. The pattern is anchored at the end so
    # that trailing junk ('#123abc') is not silently decoded, and the hex
    # form accepts the digits a-f, which the previous pattern could not.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))\\Z', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        if hex_digits is not None:
            codepoint = int(hex_digits, 16)
        else:
            codepoint = int(dec_digits, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Code point outside the representable range: fall through and
            # keep the reference as literal text instead of aborting the
            # surrounding re.sub() call.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

# Replace the parser module's start-tag pattern with the one below (the
# trailing comment marks it as a backported bugfix).
# NOTE(review): which upstream fix this corresponds to is not visible in this
# file -- confirm before changing or removing the override.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id

    Typical use: create the parser with the wanted id, feed it a document
    with loads(), then read the text found inside the matching element with
    get_result().
    """
    def __init__(self, id):
        # Value of the `id` attribute to look for.
        self.id = id
        # None until a matching start tag is seen; then a list that grows to
        # [tag_name, start_position, end_position], positions being the
        # values returned by getpos().
        self.result = None
        # True while parsing is inside the matching element.
        self.started = False
        # Per tag name, the number of currently open tags seen since the
        # match started.
        self.depth = {}
        # The complete document passed to loads().
        self.html = None
        # True between the matching start tag and the next parser event,
        # which is where the content start position gets recorded.
        self.watch_startpos = False
        # Number of parse errors recovered from so far.
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error, or give up by raising HTMLParseError.

        Gives up once more than 10 errors were recovered from, or when the
        error happens inside the matching element. Otherwise the pending
        input is replaced by the document minus its leading lines up to the
        current position, and parsing is resumed.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document `html`, keeping a copy for get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Detect the start tag carrying the wanted id and track nesting."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag directly after the matching start tag also marks
            # the beginning of the content.
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            # Count open tags by name so that the matching end tag can be
            # told apart from end tags of nested elements with the same name.
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Record the end position once the matching element is closed."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete when the open count of its own tag
            # name is back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of parser event is routed to find_startpos as well, so
    # whichever comes first after the matching start tag fixes the position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete match was recorded."""
        if self.result is None:
            return None
        # A complete match holds exactly [tag, start_position, end_position].
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Keep only the lines from the start line through the end line; the
        # start line number is shifted by one to index the list.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        # Cut everything before the start offset on the first line.
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end offset has to be
            # shifted by what was just cut from the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        # Cut everything from the end offset on the last line.
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()

def get_element_by_id(id, html):
    """Extract the content of the element whose id attribute equals `id`.

    Returns whatever IDParser.get_result() yields for the document `html`.
    A parse error does not propagate: the result gathered up to that point
    is used.
    """
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with what the parser collected so far.
        pass
    extracted = id_parser.get_result()
    return extracted


def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines in the markup become spaces, every <br> tag (with the
    whitespace around it) becomes a newline, all remaining tags are removed
    and HTML entities are replaced through unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings: '\s' in a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html


def sanitize_open(filename, open_mode):
    """Open `filename`, retrying once with a tweaked name if that fails.

    The name u'-' selects sys.stdout (switched to binary mode on win32).
    Any other name is opened with `open_mode`; if that raises IOError or
    OSError, the characters / < > : " | \\ ? * are replaced by '#' and the
    open is attempted once more. A failure of that second attempt propagates
    to the caller, like the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != u'-':
            return (open(encodeFilename(filename), open_mode), filename)
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    except (IOError, OSError):
        # Retry without the characters win32 forbids in file names.
        safe_name = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(encodeFilename(safe_name), open_mode), safe_name)


def timeconvert(timestr):
    """Turn an RFC 2822 date string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)

def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result

def orderedSet(iterable):
    """Return a list of the elements of `iterable` without duplicates.

    The first occurrence of every element is kept, in its original order.
    Elements are compared by equality, so they do not need to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique

def unescapeHTML(s):
    """Replace the HTML entities in `s` with the characters they stand for.

    @param s a string

    Every '&...;' sequence is handed to htmlentity_transform().
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)

def encodeFilename(s):
    """Return `s` in the form the file system functions should be given.

    @param s The name of the file

    The name is passed through unchanged on Python 3 and on Windows 2000 and
    up; elsewhere it is encoded with the file system encoding, dropping the
    characters that encoding cannot represent.
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept u'' directly through their Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    has_unicode_api = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if has_unicode_api:
        return s
    return s.encode(sys.getfilesystemencoding(), 'ignore')

class DownloadError(Exception):
    """Signals a failed download.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """


class SameFileError(Exception):
    """Signals that several downloads would end up in the same file.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Signals a failure during postprocessing.

    A PostProcessor's .run() method may raise this exception to indicate an
    error in the postprocessing task.
    """

class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""


class UnavailableVideoError(Exception):
    """Signals that the requested format is unavailable.

    Thrown when a video is requested in a format that is not available for
    that video.
    """


class ContentTooShortError(Exception):
    """Signals that fewer bytes arrived than the server announced.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Byte counts; None on the class until an instance sets them.
    downloaded = expected = None

    def __init__(self, downloaded, expected):
        """Remember the received (`downloaded`) and announced (`expected`)
        sizes, both in bytes."""
        self.downloaded, self.expected = downloaded, expected


class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble

    The class adds nothing to Exception; it only provides a distinct type.
    """

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """HTTP(S) handler adding standard headers and decoding compressed bodies.

    When installed with an OpenerDirector, this class automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. To avoid compression for a
    particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress `data`, first as a raw deflate stream and, if zlib
        rejects that, as a zlib-wrapped stream."""
        try:
            inflated = zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            inflated = zlib.decompress(data)
        return inflated

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status `code`.

        The code is passed to the constructor when addinfourl has a
        getcode attribute; otherwise it is set on the instance afterwards.
        """
        factory = compat_urllib_request.addinfourl
        if hasattr(factory, 'getcode'):
            return factory(stream, headers, url, code)
        response = factory(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Apply the standard headers to `req` and honour the
        no-compression marker header; returns `req`."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            # Compression was declined: do not advertise any encoding, and
            # do not send the internal marker header itself.
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return `resp`, or a decoding replacement when its
        Content-encoding is gzip or deflate.

        The replacement reuses the headers, url, code and msg of `resp`;
        the complete body is read in order to decode it.
        """
        content_encoding = resp.headers.get('Content-encoding', '')
        if content_encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
        elif content_encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
        else:
            return resp
        decoded = self.addinfourl_wrapper(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded

    # HTTPS traffic is processed exactly like HTTP traffic.
    https_request = http_request
    https_response = http_response
Commit	Line	Data
d77c3dfd FV	1	#!/usr/bin/env python
	2	# -- coding: utf-8 --
	3
	4	import gzip
03f9daab	5	import io
f4bfd65f	6	import json
d77c3dfd FV	7	import locale
	8	import os
	9	import re
	10	import sys
	11	import zlib
d77c3dfd	12	import email.utils
921a1455	13	import json
d77c3dfd	14
01ba00ca	15	try:
59ae15a5	16	import urllib.request as compat_urllib_request
01ba00ca	17	except ImportError: # Python 2
59ae15a5	18	import urllib2 as compat_urllib_request
01ba00ca PH	19
01ba00ca PH	20	try:
59ae15a5	21	import urllib.error as compat_urllib_error
01ba00ca	22	except ImportError: # Python 2
59ae15a5	23	import urllib2 as compat_urllib_error
01ba00ca PH	24
01ba00ca PH	25	try:
59ae15a5	26	import urllib.parse as compat_urllib_parse
01ba00ca	27	except ImportError: # Python 2
59ae15a5	28	import urllib as compat_urllib_parse
01ba00ca	29
799c0763 PH	30	try:
	31	from urllib.parse import urlparse as compat_urllib_parse_urlparse
	32	except ImportError: # Python 2
	33	from urlparse import urlparse as compat_urllib_parse_urlparse
	34
01ba00ca	35	try:
59ae15a5	36	import http.cookiejar as compat_cookiejar
01ba00ca	37	except ImportError: # Python 2
59ae15a5	38	import cookielib as compat_cookiejar
01ba00ca	39
3e669f36	40	try:
59ae15a5	41	import html.entities as compat_html_entities
9f37a959	42	except ImportError: # Python 2
59ae15a5	43	import htmlentitydefs as compat_html_entities
3e669f36	44
a8156c1d	45	try:
59ae15a5	46	import html.parser as compat_html_parser
9f37a959	47	except ImportError: # Python 2
59ae15a5	48	import HTMLParser as compat_html_parser
a8156c1d	49
348d0a7a	50	try:
59ae15a5	51	import http.client as compat_http_client
9f37a959	52	except ImportError: # Python 2
59ae15a5	53	import httplib as compat_http_client
348d0a7a	54
5910e210 PH	55	try:
	56	from subprocess import DEVNULL
	57	compat_subprocess_get_DEVNULL = lambda: DEVNULL
	58	except ImportError:
	59	compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
	60
9f37a959	61	try:
59ae15a5	62	from urllib.parse import parse_qs as compat_parse_qs
9f37a959	63	except ImportError: # Python 2
59ae15a5 PH	64	# HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
	65	# Python 2's version is apparently totally broken
	66	def _unquote(string, encoding='utf-8', errors='replace'):
	67	if string == '':
	68	return string
	69	res = string.split('%')
	70	if len(res) == 1:
	71	return string
	72	if encoding is None:
	73	encoding = 'utf-8'
	74	if errors is None:
	75	errors = 'replace'
	76	# pct_sequence: contiguous sequence of percent-encoded bytes, decoded
	77	pct_sequence = b''
	78	string = res[0]
	79	for item in res[1:]:
	80	try:
	81	if not item:
	82	raise ValueError
	83	pct_sequence += item[:2].decode('hex')
	84	rest = item[2:]
	85	if not rest:
	86	# This segment was just a single percent-encoded character.
	87	# May be part of a sequence of code units, so delay decoding.
	88	# (Stored in pct_sequence).
	89	continue
	90	except ValueError:
	91	rest = '%' + item
	92	# Encountered non-percent-encoded characters. Flush the current
	93	# pct_sequence.
	94	string += pct_sequence.decode(encoding, errors) + rest
	95	pct_sequence = b''
	96	if pct_sequence:
	97	# Flush the final pct_sequence
	98	string += pct_sequence.decode(encoding, errors)
	99	return string
	100
	101	def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
	102	encoding='utf-8', errors='replace'):
	103	qs, _coerce_result = qs, unicode
	104	pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
	105	r = []
	106	for name_value in pairs:
	107	if not name_value and not strict_parsing:
	108	continue
	109	nv = name_value.split('=', 1)
	110	if len(nv) != 2:
	111	if strict_parsing:
	112	raise ValueError("bad query field: %r" % (name_value,))
	113	# Handle case of a control-name with no equal sign
	114	if keep_blank_values:
	115	nv.append('')
	116	else:
	117	continue
	118	if len(nv[1]) or keep_blank_values:
	119	name = nv[0].replace('+', ' ')
	120	name = _unquote(name, encoding=encoding, errors=errors)
	121	name = _coerce_result(name)
	122	value = nv[1].replace('+', ' ')
	123	value = _unquote(value, encoding=encoding, errors=errors)
	124	value = _coerce_result(value)
	125	r.append((name, value))
	126	return r
	127
128	def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
129	encoding='utf-8', errors='replace'):
130	parsed_result = {}
131	pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
132	encoding=encoding, errors=errors)
133	for name, value in pairs:
134	if name in parsed_result:
135	parsed_result[name].append(value)
136	else:
137	parsed_result[name] = [value]
138	return parsed_result
348d0a7a	139
3e669f36	140	try:
59ae15a5	141	compat_str = unicode # Python 2
3e669f36	142	except NameError:
59ae15a5	143	compat_str = str
3e669f36 PH	144
3e669f36 PH	145	try:
59ae15a5	146	compat_chr = unichr # Python 2
3e669f36	147	except NameError:
59ae15a5	148	compat_chr = chr
3e669f36	149
3e669f36	150	std_headers = {
59ae15a5 PH	151	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
	152	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	153	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8',
	154	'Accept-Encoding': 'gzip, deflate',
	155	'Accept-Language': 'en-us,en;q=0.5',
3e669f36	156	}
d77c3dfd	157	def preferredencoding():
59ae15a5	158	"""Get preferred encoding.
d77c3dfd	159
59ae15a5 PH	160	Returns the best encoding scheme for the system, based on
	161	locale.getpreferredencoding() and some further tweaks.
	162	"""
	163	try:
	164	pref = locale.getpreferredencoding()
	165	u'TEST'.encode(pref)
	166	except:
	167	pref = 'UTF-8'
bae611f2	168
59ae15a5	169	return pref
d77c3dfd	170
8cd10ac4	171	if sys.version_info < (3,0):
59ae15a5 PH	172	def compat_print(s):
59ae15a5 PH	173	print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
8cd10ac4	174	else:
59ae15a5 PH	175	def compat_print(s):
	176	assert type(s) == type(u'')
	177	print(s)
d77c3dfd	178
f4bfd65f PH	179	# In Python 2.x, json.dump expects a bytestream.
	180	# In Python 3.x, it writes to a character stream
	181	if sys.version_info < (3,0):
	182	def write_json_file(obj, fn):
	183	with open(fn, 'wb') as f:
	184	json.dump(obj, f)
	185	else:
	186	def write_json_file(obj, fn):
	187	with open(fn, 'w', encoding='utf-8') as f:
	188	json.dump(obj, f)
	189
	190
d77c3dfd	191	def htmlentity_transform(matchobj):
59ae15a5 PH	192	"""Transforms an HTML entity to a character.
	193
	194	This function receives a match object and is intended to be used with
	195	the re.sub() function.
	196	"""
	197	entity = matchobj.group(1)
	198
	199	# Known non-numeric HTML entity
	200	if entity in compat_html_entities.name2codepoint:
	201	return compat_chr(compat_html_entities.name2codepoint[entity])
	202
	203	mobj = re.match(u'(?u)#(x?\\d+)', entity)
	204	if mobj is not None:
	205	numstr = mobj.group(1)
	206	if numstr.startswith(u'x'):
	207	base = 16
	208	numstr = u'0%s' % numstr
	209	else:
	210	base = 10
	211	return compat_chr(int(numstr, base))
	212
	213	# Unknown entity in name, return its literal representation
	214	return (u'&%s;' % entity)
d77c3dfd	215
a8156c1d PH	216	compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_](?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>](?:\s=+\s(?:'[^']'\|"[^"]"\|(?!['"])[^>\s]))?\s))?\s""", re.VERBOSE) # backport bugfix
a8156c1d PH	217	class IDParser(compat_html_parser.HTMLParser):
59ae15a5 PH	218	"""Modified HTMLParser that isolates a tag with the specified id"""
	219	def __init__(self, id):
	220	self.id = id
	221	self.result = None
	222	self.started = False
	223	self.depth = {}
	224	self.html = None
	225	self.watch_startpos = False
	226	self.error_count = 0
	227	compat_html_parser.HTMLParser.__init__(self)
	228
	229	def error(self, message):
	230	if self.error_count > 10 or self.started:
	231	raise compat_html_parser.HTMLParseError(message, self.getpos())
	232	self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
	233	self.error_count += 1
	234	self.goahead(1)
	235
	236	def loads(self, html):
	237	self.html = html
	238	self.feed(html)
	239	self.close()
	240
	241	def handle_starttag(self, tag, attrs):
	242	attrs = dict(attrs)
	243	if self.started:
	244	self.find_startpos(None)
	245	if 'id' in attrs and attrs['id'] == self.id:
	246	self.result = [tag]
	247	self.started = True
	248	self.watch_startpos = True
	249	if self.started:
	250	if not tag in self.depth: self.depth[tag] = 0
	251	self.depth[tag] += 1
	252
	253	def handle_endtag(self, tag):
	254	if self.started:
	255	if tag in self.depth: self.depth[tag] -= 1
	256	if self.depth[self.result[0]] == 0:
	257	self.started = False
	258	self.result.append(self.getpos())
	259
	260	def find_startpos(self, x):
	261	"""Needed to put the start position of the result (self.result[1])
	262	after the opening tag with the requested id"""
	263	if self.watch_startpos:
	264	self.watch_startpos = False
	265	self.result.append(self.getpos())
	266	handle_entityref = handle_charref = handle_data = handle_comment = \
	267	handle_decl = handle_pi = unknown_decl = find_startpos
	268
	269	def get_result(self):
	270	if self.result is None:
	271	return None
	272	if len(self.result) != 3:
	273	return None
	274	lines = self.html.split('\n')
	275	lines = lines[self.result[1][0]-1:self.result[2][0]]
	276	lines[0] = lines[0][self.result[1][1]:]
	277	if len(lines) == 1:
	278	lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
	279	lines[-1] = lines[-1][:self.result[2][1]]
	280	return '\n'.join(lines).strip()
9e6dd238 FV	281
9e6dd238 FV	282	def get_element_by_id(id, html):
59ae15a5 PH	283	"""Return the content of the tag with the specified id in the passed HTML document"""
	284	parser = IDParser(id)
	285	try:
	286	parser.loads(html)
	287	except compat_html_parser.HTMLParseError:
	288	pass
	289	return parser.get_result()
9e6dd238 FV	290
	291
	292	def clean_html(html):
59ae15a5 PH	293	"""Clean an HTML snippet into a readable string"""
	294	# Newline vs <br />
	295	html = html.replace('\n', ' ')
	296	html = re.sub('\s<\sbr\s/?\s>\s*', '\n', html)
	297	# Strip html tags
	298	html = re.sub('<.*?>', '', html)
	299	# Replace html entities
	300	html = unescapeHTML(html)
	301	return html
9e6dd238 FV	302
9e6dd238 FV	303
d77c3dfd	304	def sanitize_open(filename, open_mode):
59ae15a5 PH	305	"""Try to open the given filename, and slightly tweak it if this fails.
	306
	307	Attempts to open the given filename. If this fails, it tries to change
	308	the filename slightly, step by step, until it's either able to open it
	309	or it fails and raises a final exception, like the standard open()
	310	function.
	311
	312	It returns the tuple (stream, definitive_file_name).
	313	"""
	314	try:
	315	if filename == u'-':
	316	if sys.platform == 'win32':
	317	import msvcrt
	318	msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
	319	return (sys.stdout, filename)
	320	stream = open(encodeFilename(filename), open_mode)
	321	return (stream, filename)
	322	except (IOError, OSError) as err:
	323	# In case of error, try to remove win32 forbidden chars
	324	filename = re.sub(u'[/<>:"\\\|\\\\?\\*]', u'#', filename)
	325
	326	# An exception here should be caught in the caller
	327	stream = open(encodeFilename(filename), open_mode)
	328	return (stream, filename)
d77c3dfd FV	329
	330
	331	def timeconvert(timestr):
59ae15a5 PH	332	"""Convert RFC 2822 defined time string into system timestamp"""
	333	timestamp = None
	334	timetuple = email.utils.parsedate_tz(timestr)
	335	if timetuple is not None:
	336	timestamp = email.utils.mktime_tz(timetuple)
	337	return timestamp
1c469a94	338
796173d0	339	def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5 PH	340	"""Sanitizes a string so it could be used as part of a filename.
59ae15a5 PH	341	If restricted is set, use a stricter subset of allowed characters.
796173d0	342	Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
59ae15a5 PH	343	"""
	344	def replace_insane(char):
	345	if char == '?' or ord(char) < 32 or ord(char) == 127:
	346	return ''
	347	elif char == '"':
	348	return '' if restricted else '\''
	349	elif char == ':':
	350	return '_-' if restricted else ' -'
	351	elif char in '\\/\|*<>':
	352	return '_'
627dcfff	353	if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5 PH	354	return '_'
	355	if restricted and ord(char) > 127:
	356	return '_'
	357	return char
	358
	359	result = u''.join(map(replace_insane, s))
796173d0 PH	360	if not is_id:
	361	while '__' in result:
	362	result = result.replace('__', '_')
	363	result = result.strip('_')
	364	# Common case of "Foreign band name - English song title"
	365	if restricted and result.startswith('-_'):
	366	result = result[2:]
	367	if not result:
	368	result = '_'
59ae15a5	369	return result
d77c3dfd FV	370
d77c3dfd FV	371	def orderedSet(iterable):
59ae15a5 PH	372	""" Remove all duplicates from the input iterable """
	373	res = []
	374	for el in iterable:
	375	if el not in res:
	376	res.append(el)
	377	return res
d77c3dfd FV	378
d77c3dfd FV	379	def unescapeHTML(s):
59ae15a5 PH	380	"""
	381	@param s a string
	382	"""
	383	assert type(s) == type(u'')
d77c3dfd	384
59ae15a5 PH	385	result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
59ae15a5 PH	386	return result
d77c3dfd FV	387
d77c3dfd FV	388	def encodeFilename(s):
59ae15a5 PH	389	"""
	390	@param s The name of the file
	391	"""
d77c3dfd	392
59ae15a5	393	assert type(s) == type(u'')
d77c3dfd	394
59ae15a5 PH	395	# Python 3 has a Unicode API
	396	if sys.version_info >= (3, 0):
	397	return s
0f00efed	398
59ae15a5 PH	399	if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
	400	# Pass u'' directly to use Unicode APIs on Windows 2000 and up
	401	# (Detecting Windows NT 4 is tricky because 'major >= 4' would
	402	# match Windows 9x series as well. Besides, NT 4 is obsolete.)
	403	return s
	404	else:
	405	return s.encode(sys.getfilesystemencoding(), 'ignore')
d77c3dfd FV	406
d77c3dfd FV	407	class DownloadError(Exception):
59ae15a5	408	"""Download Error exception.
d77c3dfd	409
59ae15a5 PH	410	This exception may be thrown by FileDownloader objects if they are not
	411	configured to continue on errors. They will contain the appropriate
	412	error message.
	413	"""
	414	pass
d77c3dfd FV	415
	416
	417	class SameFileError(Exception):
59ae15a5	418	"""Same File exception.
d77c3dfd	419
59ae15a5 PH	420	This exception will be thrown by FileDownloader objects if they detect
	421	multiple files would have to be downloaded to the same file on disk.
	422	"""
	423	pass
d77c3dfd FV	424
	425
	426	class PostProcessingError(Exception):
59ae15a5	427	"""Post Processing exception.
d77c3dfd	428
59ae15a5 PH	429	This exception may be raised by PostProcessor's .run() method to
	430	indicate an error in the postprocessing task.
	431	"""
	432	pass
d77c3dfd FV	433
d77c3dfd FV	434	class MaxDownloadsReached(Exception):
59ae15a5 PH	435	""" --max-downloads limit has been reached. """
59ae15a5 PH	436	pass
d77c3dfd FV	437
	438
	439	class UnavailableVideoError(Exception):
59ae15a5	440	"""Unavailable Format exception.
d77c3dfd	441
59ae15a5 PH	442	This exception will be thrown when a video is requested
	443	in a format that is not available for that video.
	444	"""
	445	pass
d77c3dfd FV	446
	447
	448	class ContentTooShortError(Exception):
59ae15a5	449	"""Content Too Short exception.
d77c3dfd	450
59ae15a5 PH	451	This exception may be raised by FileDownloader objects when a file they
	452	download is too small for what the server announced first, indicating
	453	the connection was probably interrupted.
	454	"""
	455	# Both in bytes
	456	downloaded = None
	457	expected = None
d77c3dfd	458
59ae15a5 PH	459	def __init__(self, downloaded, expected):
	460	self.downloaded = downloaded
	461	self.expected = expected
d77c3dfd FV	462
d77c3dfd FV	463
0b8c922d	464	class Trouble(Exception):
59ae15a5	465	"""Trouble helper exception
dffe658b	466
59ae15a5 PH	467	This is an exception to be handled with
	468	FileDownloader.trouble
	469	"""
0b8c922d	470
01ba00ca	471	class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5 PH	472	"""Handler for HTTP requests and responses.
	473
	474	This class, when installed with an OpenerDirector, automatically adds
	475	the standard headers to every HTTP request and handles gzipped and
	476	deflated responses from web servers. If compression is to be avoided in
	477	a particular request, the original request in the program code only has
	478	to include the HTTP header "Youtubedl-No-Compression", which will be
	479	removed before making the real request.
	480
	481	Part of this code was copied from:
	482
	483	http://techknack.net/python-urllib2-handlers/
	484
	485	Andrew Rowls, the author of that code, agreed to release it to the
	486	public domain.
	487	"""
	488
	489	@staticmethod
	490	def deflate(data):
	491	try:
	492	return zlib.decompress(data, -zlib.MAX_WBITS)
	493	except zlib.error:
	494	return zlib.decompress(data)
	495
	496	@staticmethod
	497	def addinfourl_wrapper(stream, headers, url, code):
	498	if hasattr(compat_urllib_request.addinfourl, 'getcode'):
	499	return compat_urllib_request.addinfourl(stream, headers, url, code)
	500	ret = compat_urllib_request.addinfourl(stream, headers, url)
	501	ret.code = code
	502	return ret
	503
	504	def http_request(self, req):
	505	for h in std_headers:
	506	if h in req.headers:
	507	del req.headers[h]
	508	req.add_header(h, std_headers[h])
	509	if 'Youtubedl-no-compression' in req.headers:
	510	if 'Accept-encoding' in req.headers:
	511	del req.headers['Accept-encoding']
	512	del req.headers['Youtubedl-no-compression']
	513	return req
	514
	515	def http_response(self, req, resp):
	516	old_resp = resp
	517	# gzip
	518	if resp.headers.get('Content-encoding', '') == 'gzip':
	519	gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
	520	resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
	521	resp.msg = old_resp.msg
	522	# deflate
	523	if resp.headers.get('Content-encoding', '') == 'deflate':
	524	gz = io.BytesIO(self.deflate(resp.read()))
	525	resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
	526	resp.msg = old_resp.msg
	527	return resp
0f8d03f8 PH	528
	529	https_request = http_request
	530	https_response = http_response