yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import asyncio
   7 import base64
   8 import binascii
   9 import calendar
  10 import codecs
  11 import collections
  12 import contextlib
  13 import ctypes
  14 import datetime
  15 import email.utils
  16 import email.header
  17 import errno
  18 import functools
  19 import gzip
  20 import hashlib
  21 import hmac
  22 import importlib.util
  23 import io
  24 import itertools
  25 import json
  26 import locale
  27 import math
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import socket
  34 import ssl
  35 import subprocess
  36 import sys
  37 import tempfile
  38 import time
  39 import traceback
  40 import xml.etree.ElementTree
  41 import zlib
  42 import mimetypes
  43
  44 from .compat import (
  45     compat_HTMLParseError,
  46     compat_HTMLParser,
  47     compat_HTTPError,
  48     compat_basestring,
  49     compat_chr,
  50     compat_cookiejar,
  51     compat_ctypes_WINFUNCTYPE,
  52     compat_etree_fromstring,
  53     compat_expanduser,
  54     compat_html_entities,
  55     compat_html_entities_html5,
  56     compat_http_client,
  57     compat_integer_types,
  58     compat_numeric_types,
  59     compat_kwargs,
  60     compat_os_name,
  61     compat_parse_qs,
  62     compat_shlex_split,
  63     compat_shlex_quote,
  64     compat_str,
  65     compat_struct_pack,
  66     compat_struct_unpack,
  67     compat_urllib_error,
  68     compat_urllib_parse,
  69     compat_urllib_parse_urlencode,
  70     compat_urllib_parse_urlparse,
  71     compat_urllib_parse_urlunparse,
  72     compat_urllib_parse_quote,
  73     compat_urllib_parse_quote_plus,
  74     compat_urllib_parse_unquote_plus,
  75     compat_urllib_request,
  76     compat_urlparse,
  77     compat_websockets,
  78     compat_xpath,
  79 )
  80
  81 from .socks import (
  82     ProxyType,
  83     sockssocket,
  84 )
  85
  86
  87 def register_socks_protocols():
  88     # "Register" SOCKS protocols
  89     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  90     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  91     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  92         if scheme not in compat_urlparse.uses_netloc:
  93             compat_urlparse.uses_netloc.append(scheme)
  94
  95
# Type of compiled regular expression objects, obtained by compiling an empty
# pattern and taking the type of the result
compiled_regex_type = type(re.compile(''))
  98
  99
 100 def random_user_agent():
 101     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 102     _CHROME_VERSIONS = (
 103         '90.0.4430.212',
 104         '90.0.4430.24',
 105         '90.0.4430.70',
 106         '90.0.4430.72',
 107         '90.0.4430.85',
 108         '90.0.4430.93',
 109         '91.0.4472.101',
 110         '91.0.4472.106',
 111         '91.0.4472.114',
 112         '91.0.4472.124',
 113         '91.0.4472.164',
 114         '91.0.4472.19',
 115         '91.0.4472.77',
 116         '92.0.4515.107',
 117         '92.0.4515.115',
 118         '92.0.4515.131',
 119         '92.0.4515.159',
 120         '92.0.4515.43',
 121         '93.0.4556.0',
 122         '93.0.4577.15',
 123         '93.0.4577.63',
 124         '93.0.4577.82',
 125         '94.0.4606.41',
 126         '94.0.4606.54',
 127         '94.0.4606.61',
 128         '94.0.4606.71',
 129         '94.0.4606.81',
 130         '94.0.4606.85',
 131         '95.0.4638.17',
 132         '95.0.4638.50',
 133         '95.0.4638.54',
 134         '95.0.4638.69',
 135         '95.0.4638.74',
 136         '96.0.4664.18',
 137         '96.0.4664.45',
 138         '96.0.4664.55',
 139         '96.0.4664.93',
 140         '97.0.4692.20',
 141     )
 142     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 143
 144
# HTTP header values. Note that the User-Agent is chosen by
# random_user_agent() only once, when this module is imported
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings, keyed by a short name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel used as the default value of `default` parameters, so that
# an explicitly passed None can be told apart from "no default given"
NO_DEFAULT = object()

# English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names (January first) keyed by a two-letter language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 171
# File extensions (without the leading dot) of media and manifest formats
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# The replacements are paired up positionally; itertools.chain() is used so
# that multi-character replacements ('AE', 'OE', 'TH', 'ss', ...) can be
# mixed in as one-element lists between the runs of single characters.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 191
# Date/time layouts written with strptime-style % directives.
# NOTE(review): presumably tried one after another by the date parsing helpers
# defined elsewhere in this file - confirm against the callers
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats followed by the ambiguous numeric layouts read as
# day-then-month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The common formats followed by the ambiguous numeric layouts read as
# month-then-day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 255
# Matches text of the form }('...',<digits>,<digits>,'...'.split('|') and
# captures the four arguments as groups 1-4
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type attribute is application/ld+json
# (optionally quoted, case-insensitive) and captures its body as `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 258
 259
 260 def preferredencoding():
 261     """Get preferred encoding.
 262
 263     Returns the best encoding scheme for the system, based on
 264     locale.getpreferredencoding() and some further tweaks.
 265     """
 266     try:
 267         pref = locale.getpreferredencoding()
 268         'TEST'.encode(pref)
 269     except Exception:
 270         pref = 'UTF-8'
 271
 272     return pref
 273
 274
 275 def write_json_file(obj, fn):
 276     """ Encode obj as JSON and write it to fn, atomically if possible """
 277
 278     fn = encodeFilename(fn)
 279     if sys.version_info < (3, 0) and sys.platform != 'win32':
 280         encoding = get_filesystem_encoding()
 281         # os.path.basename returns a bytes object, but NamedTemporaryFile
 282         # will fail if the filename contains non ascii characters unless we
 283         # use a unicode object
 284         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 285         # the same for os.path.dirname
 286         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 287     else:
 288         path_basename = os.path.basename
 289         path_dirname = os.path.dirname
 290
 291     args = {
 292         'suffix': '.tmp',
 293         'prefix': path_basename(fn) + '.',
 294         'dir': path_dirname(fn),
 295         'delete': False,
 296     }
 297
 298     # In Python 2.x, json.dump expects a bytestream.
 299     # In Python 3.x, it writes to a character stream
 300     if sys.version_info < (3, 0):
 301         args['mode'] = 'wb'
 302     else:
 303         args.update({
 304             'mode': 'w',
 305             'encoding': 'utf-8',
 306         })
 307
 308     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 309
 310     try:
 311         with tf:
 312             json.dump(obj, tf, ensure_ascii=False)
 313         if sys.platform == 'win32':
 314             # Need to remove existing file on Windows, else os.rename raises
 315             # WindowsError or FileExistsError.
 316             try:
 317                 os.unlink(fn)
 318             except OSError:
 319                 pass
 320         try:
 321             mask = os.umask(0)
 322             os.umask(mask)
 323             os.chmod(tf.name, 0o666 & ~mask)
 324         except OSError:
 325             pass
 326         os.rename(tf.name, fn)
 327     except Exception:
 328         try:
 329             os.remove(tf.name)
 330         except OSError:
 331             pass
 332         raise
 333
 334
 335 if sys.version_info >= (2, 7):
 336     def find_xpath_attr(node, xpath, key, val=None):
 337         """ Find the xpath xpath[@key=val] """
 338         assert re.match(r'^[a-zA-Z_-]+$', key)
 339         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 340         return node.find(expr)
 341 else:
 342     def find_xpath_attr(node, xpath, key, val=None):
 343         for f in node.findall(compat_xpath(xpath)):
 344             if key not in f.attrib:
 345                 continue
 346             if val is None or f.attrib.get(key) == val:
 347                 return f
 348         return None
 349
 350 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 351 # the namespace parameter
 352
 353
 354 def xpath_with_ns(path, ns_map):
 355     components = [c.split(':') for c in path.split('/')]
 356     replaced = []
 357     for c in components:
 358         if len(c) == 1:
 359             replaced.append(c[0])
 360         else:
 361             ns, tag = c
 362             replaced.append('{%s}%s' % (ns_map[ns], tag))
 363     return '/'.join(replaced)
 364
 365
 366 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 367     def _find_xpath(xpath):
 368         return node.find(compat_xpath(xpath))
 369
 370     if isinstance(xpath, (str, compat_str)):
 371         n = _find_xpath(xpath)
 372     else:
 373         for xp in xpath:
 374             n = _find_xpath(xp)
 375             if n is not None:
 376                 break
 377
 378     if n is None:
 379         if default is not NO_DEFAULT:
 380             return default
 381         elif fatal:
 382             name = xpath if name is None else name
 383             raise ExtractorError('Could not find XML element %s' % name)
 384         else:
 385             return None
 386     return n
 387
 388
 389 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 390     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 391     if n is None or n == default:
 392         return n
 393     if n.text is None:
 394         if default is not NO_DEFAULT:
 395             return default
 396         elif fatal:
 397             name = xpath if name is None else name
 398             raise ExtractorError('Could not find XML element\'s text %s' % name)
 399         else:
 400             return None
 401     return n.text
 402
 403
 404 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 405     n = find_xpath_attr(node, xpath, key)
 406     if n is None:
 407         if default is not NO_DEFAULT:
 408             return default
 409         elif fatal:
 410             name = '%s[@%s]' % (xpath, key) if name is None else name
 411             raise ExtractorError('Could not find XML attribute %s' % name)
 412         else:
 413             return None
 414     return n.attrib[key]
 415
 416
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute('id', id, html).
    """
    return get_element_by_attribute('id', id, html)
 420
 421
def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_html_by_attribute('id', id, html).
    """
    return get_element_html_by_attribute('id', id, html)
 425
 426
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document

    Returns None if get_elements_by_class() finds no such tag.
    """
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
 431
 432
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document

    Returns None if get_elements_html_by_class() finds no such tag.
    """
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
 437
 438
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag with the specified attribute in the passed HTML document

    The arguments are passed on to get_elements_by_attribute(); None is
    returned if it finds no matching tag.
    """
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
 442
 443
def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    """Return the html of the first tag with the specified attribute in the passed HTML document

    The arguments are passed on to get_elements_html_by_attribute(); None is
    returned if it finds no matching tag.
    """
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
 447
 448
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may hold several names, so the (regex-escaped) name
    # is searched for anywhere inside the attribute value, delimited by regex
    # word boundaries; the value is already a regex, hence escape_value=False
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
 454
 455
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may hold several names, so the (regex-escaped) name
    # is searched for anywhere inside the attribute value, delimited by regex
    # word boundaries; the value is already a regex, hence escape_value=False
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
 461
 462
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list

    Takes the same arguments as get_elements_text_and_html_by_attribute().
    """
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 466
 467
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list

    Takes the same arguments as get_elements_text_and_html_by_attribute().
    """
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 471
 472
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Yield the text (content) and the html (whole) of every tag with the
    specified attribute in the passed HTML document, as (content, whole) tuples

    @param attribute     name of the attribute (always regex-escaped)
    @param value         value of the attribute; used as a regex as is if
                         escape_value is false
    @param html          the HTML document to search
    @param escape_value  whether to regex-escape value
    """

    # Quotes around the value are mandatory in the pattern if the value starts
    # with one of the characters below, and optional otherwise
    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag from its '<' up to and including the wanted
    # attribute; the rest of the element is located by the tag parser below
    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes wrapping the whole content,
            # then decode the HTML entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 496
 497
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised by handle_endtag() once the stack of open tags becomes empty"""
        pass

    def __init__(self):
        # Names of the tags opened so far and not closed yet, innermost last
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop open tags until the one being closed is found; any tags opened
        # inside it that were never closed are dropped along the way
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            # The stack ran empty without finding a matching opening tag
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The outermost (first) tag has just been closed
            raise self.HTMLBreakOnClosingTagException()
 538
 539
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the opening tag or its matching closing
    tag cannot be found.
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index(), but raises the given exception when not found
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an index into html, pointing just past the '>'
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag first so that it becomes the bottom of the stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document up to each successive closing tag candidate until
        # the parser reports that the opening tag itself has been closed
        while offset < len(html):
            # Both positions below are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 573
 574
 575 class HTMLAttributeParser(compat_HTMLParser):
 576     """Trivial HTML parser to gather the attributes for a single element"""
 577
 578     def __init__(self):
 579         self.attrs = {}
 580         compat_HTMLParser.__init__(self)
 581
 582     def handle_starttag(self, tag, attrs):
 583         self.attrs = dict(attrs)
 584
 585
 586 class HTMLListAttrsParser(compat_HTMLParser):
 587     """HTML parser to gather the attributes for the elements of a list"""
 588
 589     def __init__(self):
 590         compat_HTMLParser.__init__(self)
 591         self.items = []
 592         self._level = 0
 593
 594     def handle_starttag(self, tag, attrs):
 595         if tag == 'li' and self._level == 0:
 596             self.items.append(dict(attrs))
 597         self._level += 1
 598
 599     def handle_endtag(self, tag):
 600         self._level -= 1
 601
 602
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        # Return whatever attributes were gathered before the error
        pass
    return parser.attrs
 627
 628
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per element

    Only <li> tags that are not nested inside another tag are collected
    (see HTMLListAttrsParser)."""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
 636
 637
 638 def clean_html(html):
 639     """Clean an HTML snippet into a readable string"""
 640
 641     if html is None:  # Convenience for sanitizing descriptions etc.
 642         return html
 643
 644     html = re.sub(r'\s+', ' ', html)
 645     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 646     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 647     # Strip html tags
 648     html = re.sub('<.*?>', '', html)
 649     # Replace html entities
 650     html = unescapeHTML(html)
 651     return html.strip()
 652
 653
 654 def sanitize_open(filename, open_mode):
 655     """Try to open the given filename, and slightly tweak it if this fails.
 656
 657     Attempts to open the given filename. If this fails, it tries to change
 658     the filename slightly, step by step, until it's either able to open it
 659     or it fails and raises a final exception, like the standard open()
 660     function.
 661
 662     It returns the tuple (stream, definitive_file_name).
 663     """
 664     try:
 665         if filename == '-':
 666             if sys.platform == 'win32':
 667                 import msvcrt
 668                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 669             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 670         stream = locked_file(filename, open_mode, block=False).open()
 671         return (stream, filename)
 672     except (IOError, OSError) as err:
 673         if err.errno in (errno.EACCES,):
 674             raise
 675
 676         # In case of error, try to remove win32 forbidden chars
 677         alt_filename = sanitize_path(filename)
 678         if alt_filename == filename:
 679             raise
 680         else:
 681             # An exception here should be caught in the caller
 682             stream = locked_file(filename, open_mode, block=False).open()
 683             return (stream, alt_filename)
 684
 685
 686 def timeconvert(timestr):
 687     """Convert RFC 2822 defined time string into system timestamp"""
 688     timestamp = None
 689     timetuple = email.utils.parsedate_tz(timestr)
 690     if timetuple is not None:
 691         timestamp = email.utils.mktime_tz(timetuple)
 692     return timestamp
 693
 694
 695 def sanitize_filename(s, restricted=False, is_id=False):
 696     """Sanitizes a string so it could be used as part of a filename.
 697     If restricted is set, use a stricter subset of allowed characters.
 698     Set is_id if this is not an arbitrary string, but an ID that should be kept
 699     if possible.
 700     """
 701     def replace_insane(char):
 702         if restricted and char in ACCENT_CHARS:
 703             return ACCENT_CHARS[char]
 704         elif not restricted and char == '\n':
 705             return ' '
 706         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 707             return ''
 708         elif char == '"':
 709             return '' if restricted else '\''
 710         elif char == ':':
 711             return '_-' if restricted else ' -'
 712         elif char in '\\/|*<>':
 713             return '_'
 714         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 715             return '_'
 716         if restricted and ord(char) > 127:
 717             return '_'
 718         return char
 719
 720     if s == '':
 721         return ''
 722     # Handle timestamps
 723     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 724     result = ''.join(map(replace_insane, s))
 725     if not is_id:
 726         while '__' in result:
 727             result = result.replace('__', '_')
 728         result = result.strip('_')
 729         # Common case of "Foreign band name - English song title"
 730         if restricted and result.startswith('-_'):
 731             result = result[2:]
 732         if result.startswith('-'):
 733             result = '_' + result[len('-'):]
 734         result = result.lstrip('.')
 735         if not result:
 736             result = '_'
 737     return result
 738
 739
 740 def sanitize_path(s, force=False):
 741     """Sanitizes and normalizes path on Windows"""
 742     if sys.platform == 'win32':
 743         force = False
 744         drive_or_unc, _ = os.path.splitdrive(s)
 745         if sys.version_info < (2, 7) and not drive_or_unc:
 746             drive_or_unc, _ = os.path.splitunc(s)
 747     elif force:
 748         drive_or_unc = ''
 749     else:
 750         return s
 751
 752     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 753     if drive_or_unc:
 754         norm_path.pop(0)
 755     sanitized_path = [
 756         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 757         for path_part in norm_path]
 758     if drive_or_unc:
 759         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 760     elif force and s[0] == os.path.sep:
 761         sanitized_path.insert(0, os.path.sep)
 762     return os.path.join(*sanitized_path)
 763
 764
 765 def sanitize_url(url):
 766     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 767     # the number of unwanted failures due to missing protocol
 768     if url.startswith('//'):
 769         return 'http:%s' % url
 770     # Fix some common typos seen so far
 771     COMMON_TYPOS = (
 772         # https://github.com/ytdl-org/youtube-dl/issues/15649
 773         (r'^httpss://', r'https://'),
 774         # https://bx1.be/lives/direct-tv/
 775         (r'^rmtp([es]?)://', r'rtmp\1://'),
 776     )
 777     for mistake, fixup in COMMON_TYPOS:
 778         if re.match(mistake, url):
 779             return re.sub(mistake, fixup, url)
 780     return url
 781
 782
 783 def extract_basic_auth(url):
 784     parts = compat_urlparse.urlsplit(url)
 785     if parts.username is None:
 786         return url, None
 787     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 788         parts.hostname if parts.port is None
 789         else '%s:%d' % (parts.hostname, parts.port))))
 790     auth_payload = base64.b64encode(
 791         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 792     return url, 'Basic ' + auth_payload.decode('utf-8')
 793
 794
 795 def sanitized_Request(url, *args, **kwargs):
 796     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 797     if auth_header is not None:
 798         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 799         headers['Authorization'] = auth_header
 800     return compat_urllib_request.Request(url, *args, **kwargs)
 801
 802
def expand_path(s):
    """Expand shell variables and ~"""
    # compat_expanduser() is applied first, os.path.expandvars() to its result
    return os.path.expandvars(compat_expanduser(s))
 806
 807
 808 def orderedSet(iterable):
 809     """ Remove all duplicates from the input iterable """
 810     res = []
 811     for el in iterable:
 812         if el not in res:
 813             res.append(el)
 814     return res
 815
 816
 817 def _htmlentity_transform(entity_with_semicolon):
 818     """Transforms an HTML entity to a character."""
 819     entity = entity_with_semicolon[:-1]
 820
 821     # Known non-numeric HTML entity
 822     if entity in compat_html_entities.name2codepoint:
 823         return compat_chr(compat_html_entities.name2codepoint[entity])
 824
 825     # TODO: HTML5 allows entities without a semicolon. For example,
 826     # '&Eacuteric' should be decoded as 'Éric'.
 827     if entity_with_semicolon in compat_html_entities_html5:
 828         return compat_html_entities_html5[entity_with_semicolon]
 829
 830     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 831     if mobj is not None:
 832         numstr = mobj.group(1)
 833         if numstr.startswith('x'):
 834             base = 16
 835             numstr = '0%s' % numstr
 836         else:
 837             base = 10
 838         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 839         try:
 840             return compat_chr(int(numstr, base))
 841         except ValueError:
 842             pass
 843
 844     # Unknown entity in name, return its literal representation
 845     return '&%s;' % entity
 846
 847
 848 def unescapeHTML(s):
 849     if s is None:
 850         return None
 851     assert type(s) == compat_str
 852
 853     return re.sub(
 854         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 855
 856
 857 def escapeHTML(text):
 858     return (
 859         text
 860         .replace('&', '&amp;')
 861         .replace('<', '&lt;')
 862         .replace('>', '&gt;')
 863         .replace('"', '&quot;')
 864         .replace("'", '&#39;')
 865     )
 866
 867
def process_communicate_or_kill(p, *args, **kwargs):
    """Return p.communicate(*args, **kwargs), killing the process if that fails

    On any exception raised by communicate(), p is killed and waited for,
    and the exception is re-raised.
    """
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        # Wait for the killed process to actually terminate
        p.wait()
        raise
 875
 876
 877 class Popen(subprocess.Popen):
 878     if sys.platform == 'win32':
 879         _startupinfo = subprocess.STARTUPINFO()
 880         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 881     else:
 882         _startupinfo = None
 883
 884     def __init__(self, *args, **kwargs):
 885         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 886
 887     def communicate_or_kill(self, *args, **kwargs):
 888         return process_communicate_or_kill(self, *args, **kwargs)
 889
 890
 891 def get_subprocess_encoding():
 892     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 893         # For subprocess calls, encode with locale encoding
 894         # Refer to http://stackoverflow.com/a/9951851/35070
 895         encoding = preferredencoding()
 896     else:
 897         encoding = sys.getfilesystemencoding()
 898     if encoding is None:
 899         encoding = 'utf-8'
 900     return encoding
 901
 902
 903 def encodeFilename(s, for_subprocess=False):
 904     """
 905     @param s The name of the file
 906     """
 907
 908     assert type(s) == compat_str
 909
 910     # Python 3 has a Unicode API
 911     if sys.version_info >= (3, 0):
 912         return s
 913
 914     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 915     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 916     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 917     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 918         return s
 919
 920     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 921     if sys.platform.startswith('java'):
 922         return s
 923
 924     return s.encode(get_subprocess_encoding(), 'ignore')
 925
 926
 927 def decodeFilename(b, for_subprocess=False):
 928
 929     if sys.version_info >= (3, 0):
 930         return b
 931
 932     if not isinstance(b, bytes):
 933         return b
 934
 935     return b.decode(get_subprocess_encoding(), 'ignore')
 936
 937
 938 def encodeArgument(s):
 939     if not isinstance(s, compat_str):
 940         # Legacy code that uses byte strings
 941         # Uncomment the following line after fixing all post processors
 942         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 943         s = s.decode('ascii')
 944     return encodeFilename(s, True)
 945
 946
 947 def decodeArgument(b):
 948     return decodeFilename(b, True)
 949
 950
 951 def decodeOption(optval):
 952     if optval is None:
 953         return optval
 954     if isinstance(optval, bytes):
 955         optval = optval.decode(preferredencoding())
 956
 957     assert isinstance(optval, compat_str)
 958     return optval
 959
 960
 961 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 962
 963
 964 def timetuple_from_msec(msec):
 965     secs, msec = divmod(msec, 1000)
 966     mins, secs = divmod(secs, 60)
 967     hrs, mins = divmod(mins, 60)
 968     return _timetuple(hrs, mins, secs, msec)
 969
 970
 971 def formatSeconds(secs, delim=':', msec=False):
 972     time = timetuple_from_msec(secs * 1000)
 973     if time.hours:
 974         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 975     elif time.minutes:
 976         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 977     else:
 978         ret = '%d' % time.seconds
 979     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 980
 981
 982 def _ssl_load_windows_store_certs(ssl_context, storename):
 983     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 984     try:
 985         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 986                  if encoding == 'x509_asn' and (
 987                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 988     except PermissionError:
 989         return
 990     for cert in certs:
 991         try:
 992             ssl_context.load_verify_locations(cadata=cert)
 993         except ssl.SSLError:
 994             pass
 995
 996
 997 def make_HTTPS_handler(params, **kwargs):
 998     opts_check_certificate = not params.get('nocheckcertificate')
 999     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1000     context.check_hostname = opts_check_certificate
1001     if params.get('legacyserverconnect'):
1002         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
1003     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1004     if opts_check_certificate:
1005         try:
1006             context.load_default_certs()
1007             # Work around the issue in load_default_certs when there are bad certificates. See:
1008             # https://github.com/yt-dlp/yt-dlp/issues/1060,
1009             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1010         except ssl.SSLError:
1011             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1012             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1013                 # Create a new context to discard any certificates that were already loaded
1014                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1015                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
1016                 for storename in ('CA', 'ROOT'):
1017                     _ssl_load_windows_store_certs(context, storename)
1018             context.set_default_verify_paths()
1019     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1020
1021
def bug_reports_message(before=';'):
    """Return the "please report this issue" text, prefixed with before.

    Trailing whitespace of before is dropped. The text starts with a capital
    letter when before is empty or ends a sentence ('.', '!' or '?');
    a non-empty before is joined to the text with a single space.
    """
    prefix = before.rstrip()
    msg = ('please report this issue on  https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    if not prefix:
        return msg
    return prefix + ' ' + msg
1032
1033
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it with a fixed text
    msg = None

    def __init__(self, msg=None):
        """Use msg if given, else the class-level msg, else the name of the class."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1044
1045
# Exception types regarded as network errors (a tuple, so it can be used with `except`)
network_exceptions = (
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1050
1051
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        The final message is '[ie] video_id: msg (caused by cause)', followed by
        the bug report notice unless the error is expected.
        """
        exc_info = sys.exc_info()  # preserve original exception
        # An error raised while one of the network exceptions is being handled
        # is always treated as expected
        if exc_info[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback, self.expected, self.cause = tb, expected, cause
        self.video_id, self.ie = video_id, ie
        self.exc_info = exc_info

        prefix = format_field(ie, template='[%s] ') + format_field(video_id, template='%s: ')
        suffix = format_field(cause, template=' (caused by %r)')
        if not expected:
            suffix += bug_reports_message()
        super().__init__(prefix + self.msg + suffix)

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
1081
1082
class UnsupportedError(ExtractorError):
    """Expected extractor error reporting an unsupported URL; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1088
1089
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match.

    Adds nothing of its own; construction and attributes are those of ExtractorError.
    """
    pass
1093
1094
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    The countries passed to the constructor are stored in .countries.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Always reported as an expected error, whatever the caller passed
        super().__init__(msg, **dict(kwargs, expected=True))
        self.countries = countries
1106
1107
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1120
1121
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    # Message used when the exception is created without an explicit one
    msg = 'Entry not found in info'
1129
1130
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """filename, if given, is appended to the standard message."""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1143
1144
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task. It adds nothing to
    YoutubeDLError.
    """
1151
1152
class DownloadCancelled(YoutubeDLError):
    """Exception raised when the download queue should be interrupted."""
    # Message used when the exception is created without an explicit one
    msg = 'The download was cancelled'
1156
1157
class ExistingVideoReached(DownloadCancelled):
    """--break-on-existing triggered."""
    # Message used when the exception is created without an explicit one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1161
1162
class RejectedVideoReached(DownloadCancelled):
    """--break-on-reject triggered."""
    # Message used when the exception is created without an explicit one
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1166
1167
class MaxDownloadsReached(DownloadCancelled):
    """--max-downloads limit has been reached."""
    # Message used when the exception is created without an explicit one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1171
1172
class ReExtractInfo(YoutubeDLError):
    """Video info needs to be re-extracted.

    The expected flag given to the constructor is stored in .expected.
    """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1179
1180
class ThrottledDownload(ReExtractInfo):
    """Download speed below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always the fixed class-level message, with expected=False
        super().__init__(self.msg, expected=False)
1187
1188
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """err, if given, is appended to the standard message."""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1201
1202
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super().__init__(message)
1218
1219
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno-style code and a message, classified into .reason.

    .reason is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        no_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg
            or 'Disk quota exceeded' in msg)
        if no_space:
            self.reason = 'NO_SPACE'
            return
        too_long = code == errno.E2BIG or 'Argument list too long' in msg
        self.reason = 'VALUE_TOO_LONG' if too_long else 'NOT_SUPPORTED'
1234
1235
class XAttrUnavailableError(YoutubeDLError):
    """Error type without any state or behaviour beyond YoutubeDLError.

    NOTE(review): judging by the name it signals that extended file
    attributes (xattr) cannot be used - confirm against the code raising it.
    """
    pass
1238
1239
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and make it honour the handler's 'source_address' param.

    ydl_handler is the handler whose _params mapping is consulted, http_class
    the connection class that gets instantiated with *args/**kwargs. is_https
    is only looked at by the legacy connect() fallback to decide whether the
    socket is wrapped with SSL.

    Without a 'source_address' the connection is returned as created. With
    one, the connection is set up to bind to (source_address, 0) and - where
    the connection object has a _create_connection hook - to try only those
    remote addresses that belong to the same IP family as the source address.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        # NOTE: the source_address parameter below shadows the outer variable; it is
        # indexed and passed to bind(), i.e. treated as a (host, port) pair
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source host means IPv4, anything else is taken as IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try the candidates in order; the first successful connection wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and move on to the next candidate
                    err = _
                    if sock is not None:
                        sock.close()
            # No candidate worked: re-raise the last failure, if there was one
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 in the address that the socket gets bound to
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute to set: replace connect() altogether
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1303
1304
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is. With it, a new dict is
    returned that contains neither the marker nor any Accept-Encoding header
    (matched case-insensitively); the dict passed in is left untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    remaining = {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding'}
    del remaining['Youtubedl-no-compression']
    return remaining
1313
1314
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is stored as _params; the other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over plain HTTP, through a SOCKS proxy if requested.

        A SOCKS proxy is requested with the internal 'Ytdl-socks-proxy'
        header, which is removed from the request here.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data; empty data is returned unchanged.

        The data is first decompressed with a negative window size (no zlib
        header expected); if zlib rejects that, zlib's defaults are used.
        """
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Prepare an outgoing request.

        Percent-encodes the URL if necessary, adds the std_headers that are
        not already present and applies handle_youtubedl_headers().
        Returns the request to send (a new object if the URL was changed).
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response.

        Bodies with a gzip or deflate Content-encoding are decompressed and
        wrapped in a new response object (without the Content-encoding
        header); the Location header of 3xx responses is percent-encoded.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; the original
                # error is raised again if none of the attempts succeeds
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            # New response around the decompressed body, reusing the original headers, URL and status
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                # The header is only rewritten when escaping changed something
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests and responses get exactly the same processing
    https_request = http_request
    https_response = http_response
1438
1439
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class has to be an HTTPConnection or HTTPSConnection class.
    socks_proxy is the proxy URL: its scheme selects the protocol (socks5,
    socks/socks4 or socks4a, case-insensitive), the port defaults to 1080 and
    username/password, if present, are URL-unquoted.

    Raises ValueError for any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            """Connect to (host, port) through the proxy, adding TLS for HTTPS connections."""
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only a plain int/float timeout is applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1481
1482
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections via _create_http_connection.

    params is stored as _params. https_conn_class selects the connection
    class and defaults to compat_http_client.HTTPSConnection; all other
    arguments are passed on to HTTPSHandler.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, through a SOCKS proxy if requested.

        A SOCKS proxy is requested with the internal 'Ytdl-socks-proxy'
        header, which is removed from the request here.
        """
        conn_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # Hand over whichever SSL settings the base handler keeps on this object
        # ('_context': python > 2.6, '_check_hostname': python 3.x)
        open_kwargs = {}
        for option, attr in (('context', '_context'), ('check_hostname', '_check_hostname')):
            if hasattr(self, attr):
                open_kwargs[option] = getattr(self, attr)

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1506
1507
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.

        filename defaults to self.filename; ValueError is raised if neither is set.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                expires = '' if cookie.expires is None else compat_str(cookie.expires)
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name, value = '', cookie.name
                else:
                    name, value = cookie.name, cookie.value
                fields = [cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value]
                f.write('\t'.join(fields) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither is set.
        Malformed entries are skipped with a warning on stderr.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Entries marked as HttpOnly are treated like any other entry
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            fields = line.split('\t')
            if len(fields) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(fields))
            entry = self._CookieFileEntry(*fields)
            if entry.expires_at and not entry.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
            return line

        # The sanitized lines are collected in memory and parsed from there
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    prepared = prepare_line(line)
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
                cf.write(prepared)
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1624
1625
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also processes HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # The workaround for that (percent-encoding the Set-Cookie and Set-Cookie2
        # headers before HTTPCookieProcessor processes them) is disabled, so the
        # response goes to the base class unchanged.
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1648
1649
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Derived from CPython's HTTPRedirectHandler implementation [1].

    Compared to the stock handler it:
     - makes sure the redirect URL is unicode under python 2
     - also follows the experimental HTTP status code
       308 Permanent Redirect [2], which some sites use [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Every supported 30x status is served by the stock 302 implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect response.

        Called by the http_error_30x methods once a redirection response
        has arrived.  Returns the new Request when the redirect should be
        followed; raises HTTPError when the status/method combination must
        not be redirected.  The new request is created without a body and
        without the content-length/content-type headers of the original.
        """
        method = req.get_method()
        may_redirect = (
            code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
            or code in (301, 302, 303) and method == "POST")
        if not may_redirect:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # RFC 2616 says a 301/302 answer to a POST MUST NOT be followed
        # without user confirmation.  Practically every client follows it
        # anyway, and so does this handler.

        # Under python 2 urlh.geturl() may hand back the redirect URL as a
        # byte string; coerce it so it is always unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Tolerate URIs containing a space.  http_error_302() already does a
        # more complete encoding; this is kept for other callers.
        newurl = newurl.replace(' ', '%20')

        content_headers = ("content-length", "content-type")
        forwarded_headers = {
            name: value for name, value in req.headers.items()
            if name.lower() not in content_headers}
        return compat_urllib_request.Request(
            newurl, headers=forwarded_headers, origin_req_host=req.origin_req_host,
            unverifiable=True)
1705
1706
def extract_timezone(date_str):
    """Split a trailing UTC designator off a date string.

    Returns a tuple ``(offset, remainder)`` where ``offset`` is a
    datetime.timedelta (zero when no offset is recognised or the designator
    is ``Z``) and ``remainder`` is ``date_str`` shortened by the length of
    the recognised designator.
    """
    # NOTE(review): the `$` anchor sits inside the second alternative only,
    # so a bare `Z` is accepted wherever it first occurs after the 8-char
    # prefix, while the stripping below always removes characters from the
    # end of the string - confirm this is intended.
    tz_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_text = tz_match.group('sign')
    if not sign_text:
        # Plain "Z": UTC, i.e. no offset
        return datetime.timedelta(), remainder

    direction = 1 if sign_text == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1731
1732
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-like date string.

    delimiter: separator between the date and the time part
    timezone:  UTC offset as a datetime.timedelta; when None it is taken
               from the end of date_str via extract_timezone()

    Returns None when date_str is None or does not match the expected
    layout.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime layout below
    trimmed = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, trimmed = extract_timezone(trimmed)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(trimmed, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1750
1751
def date_formats(day_first=True):
    """Return the strptime format list matching the day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1754
1755
def unified_strdate(date_str, day_first=True):
    """Return the date as a string in the format YYYYMMDD.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces
    cleaned = date_str.replace(',', ' ')
    # Drop AM/PM together with a following timezone word
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; when several match, the last one wins
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the e-mail style date parser
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
1782
1783
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    The string is tried against the formats from date_formats(day_first)
    and, failing that, against the e-mail style parser. A trailing UTC
    offset recognised by extract_timezone() is applied in both cases.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # NOTE(review): 12 hours are added whenever "PM" occurs anywhere in the
    # string, which also affects 12:xx PM times - confirm this is acceptable.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The offset was already stripped from date_str by extract_timezone(),
        # so parsedate_tz() cannot see it; apply it here just like the
        # strptime branch above does.
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
    return None
1815
1816
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    Returns default_ext when url is None, contains no dot, or the text
    after the last dot (query string excluded) is neither purely
    alphanumeric nor a known extension followed by slashes.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Handle URLs like http://example.com/foo/bar.mp4/?download
    without_slash = candidate.rstrip('/')
    if without_slash in KNOWN_EXTENSIONS:
        return without_slash
    return default_ext
1828
1829
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by `<sub_lang>.<sub_format>`."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1832
1833
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Build a datetime object from an absolute or relative date string.

    date_str is either a date in `format` (YYYYMMDD by default) or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used for absolute dates
    precision: unit the time portion is rounded to.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit given in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is None:
        # Not a relative expression: parse it as an absolute date
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset may itself be relative, so recurse
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    unit = match.group('unit')
    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to whole days
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1874
1875
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    If "strict", only an 8-digit date or
    (now|today)[+-][0-9](day|week|month|year)(s)? is accepted; anything else
    raises ValueError.

    format: strptime format used for absolute dates
    """
    if strict:
        accepted = re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str)
        if not accepted:
            raise ValueError(f'Invalid date format {date_str}')
    moment = datetime_from_str(date_str, precision='microsecond', format=format)
    return moment.date()
1888
1889
def datetime_add_months(dt, months):
    """Shift a datetime by a (possibly negative) number of months.

    The day is clamped to the last day of the target month.
    """
    # Work on a zero-based month index so divmod yields the year carry
    year_carry, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_carry
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1897
1898
def datetime_round(dt, precision='day'):
    """
    Round the time of a datetime object to the given precision.

    precision: microsecond|second|minute|hour|day. 'microsecond' returns dt
    unchanged; any other value rounds to the nearest multiple of that unit
    (halves round up) and returns a new datetime built from the timestamp.
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1915
1916
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not in 'YYYYMMDD' format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    year, month, day = parts.groups()
    return '-'.join((year, month, day))
1925
1926
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """Create the range.

        start and end must be strings accepted by date_from_str() in strict
        mode. A missing start means the earliest representable date, a
        missing end the latest one.

        Raises ValueError when start lies after end.
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range

        date may be a datetime.date, a datetime.datetime (only its date part
        is considered) or a string accepted by date_from_str().
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date but cannot be compared with
            # plain date objects, so reduce it to its date part first
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1956
1957
def platform_name():
    """Return the platform description as a compat_str."""
    description = platform.platform()
    if isinstance(description, bytes):
        description = description.decode(preferredencoding())

    assert isinstance(description, compat_str)
    return description
1966
1967
def get_windows_version():
    """Get the Windows version as a version tuple, or None when not running on Windows."""
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1974
1975
def _windows_write_string(s, out):
    """ Write a string straight to the Windows console via WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (the stream has no usable file
    descriptor, is not stdout/stderr, or is not attached to a console).

    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle
    # (1 is stdout, 2 is stderr)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 functions used below with explicit prototypes
    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device
        # (ignoring the "remote" flag) or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) when there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, each chunk ending before the
    # next non-BMP character. A non-BMP character at the start of the remaining
    # text (count == 0) is written on its own with a length of 2.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            # Two units were written, but they are one Python character
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part the console actually accepted
            s = s[written.value:]
    return True
2048
2049
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is given, the console API is tried
    first. Otherwise the text is encoded (ignoring unencodable characters)
    for binary streams and streams exposing a `buffer`, and written as-is to
    anything else.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno')
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
2070
2071
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integers.

    Sequences that already yield ints (e.g. bytes on Python 3) are copied
    into a list; otherwise each item is converted with ord().
    """
    if not bs:
        return []
    first = bs[0]
    # Python 3 bytes already iterate as ints
    return list(bs) if isinstance(first, int) else [ord(ch) for ch in bs]
2079
2080
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a bytes object (b'' when empty)."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
2085
2086
# Cross-platform file locking
#
# Defines the module-level helpers `_lock_file(f, exclusive, block)` and
# `_unlock_file(f)`. Windows uses LockFileEx/UnlockFileEx through ctypes,
# other platforms use fcntl.flock, and platforms without fcntl get stubs
# that raise IOError.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx/UnlockFileEx; Offset/OffsetHigh give
        # the start of the locked byte range
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):  # todo: block unused on win32
        # Lock the file behind the file object f; raises OSError on failure.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file() can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 (LOCKFILE_EXCLUSIVE_LOCK) for an exclusive lock,
        # 0x0 for a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release a lock taken by _lock_file(); raises OSError on failure.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            # Shared lock when not exclusive (block is ignored in that case);
            # otherwise an exclusive lock that is non-blocking when block is
            # false
            fcntl.flock(f,
                        fcntl.LOCK_SH if not exclusive
                        else fcntl.LOCK_EX if block
                        else fcntl.LOCK_EX | fcntl.LOCK_NB)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)

    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        # Without fcntl both helpers fail loudly
        def _lock_file(f, exclusive, block):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
2164
2165
class locked_file(object):
    """File wrapper that holds a lock on the file while it is in use.

    The file is opened in __init__; the lock is taken in __enter__()/open()
    and released, together with closing the file, in __exit__()/close().
    Read modes take a shared lock, all other modes an exclusive one.
    """
    # NOTE(review): the file is opened in __init__, i.e. before the lock is
    # acquired, so with mode 'w'/'wb' it is truncated without holding the
    # lock - confirm whether callers depend on this.

    def __init__(self, filename, mode, block=True, encoding=None):
        assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode
        self.block = block

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
        except IOError:
            # Do not leak the file handle when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

    def flush(self):
        self.f.flush()

    def open(self):
        """Acquire the lock; equivalent to entering the context manager."""
        return self.__enter__()

    def close(self, *args):
        """Release the lock and close the file.

        Positional arguments are accepted for compatibility and ignored.
        """
        # There is no exception in flight here, so pass the conventional
        # (None, None, None) triple to __exit__
        self.__exit__(None, None, None)
2205
2206
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when it is unknown."""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
2210
2211
def shell_quote(args):
    """Return the arguments shell-quoted and joined by single spaces.

    Byte strings are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(_as_text(arg)) for arg in args)
2221
2222
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    Data already smuggled into url is merged into `data` (the passed dict is
    updated in place, existing smuggled values taking precedence) and the
    result is appended to the bare URL as a fragment.
    """
    bare_url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return bare_url + '#' + fragment
2231
2232
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into ``(url, data)``.

    When the URL carries no smuggled data it is returned unchanged together
    with `default`.
    """
    if '#__youtubedl_smuggle' in smug_url:
        bare_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return bare_url, json.loads(encoded)
    return smug_url, default
2240
2241
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with a magnitude suffix like k, M, G, etc.

    num:    the number to format; anything float_or_none() cannot convert,
            as well as negative numbers, yields None
    fmt:    %-format receiving the scaled number and the suffix
    factor: base of the magnitude steps; with 1024 the binary suffixes
            (Ki, Mi, ...) are used
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        # A logarithm is undefined for negative numbers
        return None
    suffixes = ['', *'kMGTPEZY']
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes: values below 1 would otherwise
        # give a negative exponent (indexing the list from its end) and very
        # large values would run past the last suffix
        exponent = min(max(int(math.log(num, factor)), 0), len(suffixes) - 1)
    suffix = suffixes[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2253
2254
def format_bytes(bytes):
    """Format a byte count with binary suffixes (e.g. KiB), or 'N/A' if that is not possible."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2257
2258
def lookup_unit_table(unit_table, s):
    """Parse a leading "<number> <unit>" from s and scale it by the unit.

    unit_table maps unit names to multipliers. The number may use either
    ',' or '.' as the decimal separator.

    Returns the scaled value rounded to the nearest integer, or None when s
    does not start with a number followed by one of the units.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    # Round instead of truncating: the float product can land just below
    # the exact value (1.001 * 1000 == 1000.9999999999999)
    return round(float(num_str) * mult)
2268
2269
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None when s is None or no known unit is recognised.
    """
    if s is None:
        return None

    dec, binary = 1000, 1024
    # One group of rows per magnitude. The lower-case forms are of course
    # incorrect and unofficial, but they are supported too.
    unit_table = {
        'B': 1, 'b': 1, 'bytes': 1,
        'KiB': binary, 'KB': dec, 'kB': binary, 'Kb': dec, 'kb': dec,
        'kilobytes': dec, 'kibibytes': binary,
        'MiB': binary ** 2, 'MB': dec ** 2, 'mB': binary ** 2, 'Mb': dec ** 2, 'mb': dec ** 2,
        'megabytes': dec ** 2, 'mebibytes': binary ** 2,
        'GiB': binary ** 3, 'GB': dec ** 3, 'gB': binary ** 3, 'Gb': dec ** 3, 'gb': dec ** 3,
        'gigabytes': dec ** 3, 'gibibytes': binary ** 3,
        'TiB': binary ** 4, 'TB': dec ** 4, 'tB': binary ** 4, 'Tb': dec ** 4, 'tb': dec ** 4,
        'terabytes': dec ** 4, 'tebibytes': binary ** 4,
        'PiB': binary ** 5, 'PB': dec ** 5, 'pB': binary ** 5, 'Pb': dec ** 5, 'pb': dec ** 5,
        'petabytes': dec ** 5, 'pebibytes': binary ** 5,
        'EiB': binary ** 6, 'EB': dec ** 6, 'eB': binary ** 6, 'Eb': dec ** 6, 'eb': dec ** 6,
        'exabytes': dec ** 6, 'exbibytes': binary ** 6,
        'ZiB': binary ** 7, 'ZB': dec ** 7, 'zB': binary ** 7, 'Zb': dec ** 7, 'zb': dec ** 7,
        'zettabytes': dec ** 7, 'zebibytes': binary ** 7,
        'YiB': binary ** 8, 'YB': dec ** 8, 'yB': binary ** 8, 'Yb': dec ** 8, 'yb': dec ** 8,
        'yottabytes': dec ** 8, 'yobibytes': binary ** 8,
    }

    return lookup_unit_table(unit_table, s)
2339
2340
def parse_count(s):
    """Parse a count such as '1.2M' or '1,234 views' into an integer.

    Returns None when s is None or no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label that is followed by whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Fall back to a bare leading number followed by other text
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', text)
    if leading_number:
        return str_to_int(leading_number.group(1))
    return None
2368
2369
def parse_resolution(s):
    """Extract resolution information from a string.

    Returns a dict with 'width' and 'height' for "WxH"-like text, with only
    'height' for "720p"/"1080i"-like text or a "4k"/"8k" label, and an empty
    dict when s is None or nothing is recognised.
    """
    if s is None:
        return {}

    dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    line_count = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if line_count:
        return {'height': int(line_count.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        # 4k -> 2160 lines, 8k -> 4320 lines
        return {'height': int(k_label.group(1)) * 540}

    return {}
2390
2391
def parse_bitrate(s):
    """Return the integer in front of 'kbps' in s, or None if s is not a string or has no such number."""
    if isinstance(s, compat_str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found:
            return int(found.group(1))
    return None
2398
2399
def month_by_name(name, lang='en'):
    """ Return the number of a month by its (locale-independent) name

    The names for `lang` are used, falling back to the English ones when the
    language is not available. Returns None for an unknown name. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2409
2410
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the month name.
        Returns None for an unknown abbreviation. """

    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
2419
2420
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    Ampersands that already start one of the predefined entities (amp, lt,
    gt, apos, quot) or a numeric character reference are left untouched.
    """
    # Character references can be up to 6 hex digits (&#x10FFFF;) or 7
    # decimal digits (&#1114111;) long
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
2427
2428
def setproctitle(title):
    """Set the process name through libc's prctl(); best effort.

    Silently does nothing on Jython, when libc.so.6 cannot be loaded or
    when the loaded libc does not provide prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so the name handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 = PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2453
2454
def remove_start(s, start):
    """Return s without the prefix `start`; s is returned unchanged if it is None or lacks the prefix."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2457
2458
def remove_end(s, end):
    """Return s without the suffix `end`; s is returned unchanged if it is None or lacks the suffix."""
    # An empty suffix must be skipped explicitly: s[:-0] is s[:0], which
    # would wipe the whole string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2461
2462
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    s is returned unchanged when it is None, shorter than two characters or
    not wrapped in a matching pair of quotes.
    """
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
2470
2471
def get_domain(url):
    """Return the host part of url without scheme and a leading 'www.', or None if there is none."""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
2475
2476
def url_basename(url):
    """Return the last segment of the URL's path (slashes at both ends are ignored)."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
2480
2481
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any '?', '#' or '&'."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
2484
2485
def urljoin(base, path):
    """Join path onto base, being strict about the inputs.

    Byte strings are decoded as UTF-8. Returns None when path is empty or
    not a string, or when base is not an http(s) or protocol-relative URL.
    A path that is already absolute or protocol-relative is returned as is.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    # Already a full or protocol-relative URL: nothing to join
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    usable_base = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not usable_base:
        return None
    return compat_urlparse.urljoin(base, path)
2499
2500
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
2504
2505
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
2509
2510
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int and rescale it as `int(v) * invscale // scale`.

    If get_attr is given and v is not None, the attribute of that name is read
    from v first (a missing attribute counts as None).
    Returns default when the conversion fails """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        result = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        result = default
    return result
2518
2519
def str_or_none(v, default=None):
    """ Return v converted to a string, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
2522
2523
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if not isinstance(int_str, compat_str):
        return None
    # Drop thousands separators and "+" signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2531
2532
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to float and rescale it as `float(v) * invscale / scale`.
    Returns default when v is None or the conversion fails """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2540
2541
def bool_or_none(v, default=None):
    """ Return v only if it is an actual bool, otherwise default """
    if isinstance(v, bool):
        return v
    return default
2544
2545
def strip_or_none(v, default=None):
    """ Return v stripped of surrounding whitespace if it is a string, otherwise default """
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2548
2549
def url_or_none(url):
    """ Return url with surrounding whitespace stripped if it is a string that
    starts with "//" or with an http(s), rtmp*, rtsp*, mms or ftp(s) scheme
    followed by "//"; otherwise return None """
    if not url or not isinstance(url, compat_str):
        return None
    stripped = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', stripped):
        return stripped
    return None
2555
2556
def strftime_or_none(timestamp, date_format, default=None):
    """ Format timestamp using date_format.

    timestamp is either a number (unix timestamp, interpreted as UTC) or a
    string in YYYYMMDD form.
    Returns default if timestamp has another type, cannot be parsed, or is
    outside the range the platform can represent """
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            return default
        return datetime_object.strftime(date_format)
    # utcfromtimestamp signals out-of-range values with OverflowError or OSError
    # (depending on the platform), not only ValueError
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        return default
2567
2568
def parse_duration(s):
    """ Parse a textual duration and return it in seconds, or None if s is not
    a non-blank string in a recognized format.

    Three notations are tried in order:
      1. clock style:      [[[DD:]HH:]MM:]SS[.ms]
      2. unit style:       e.g. "PT1H2M3.5S", "1h 2m 3s", "3 min"
                           (years/months/weeks are accepted but ignored)
      3. decimal quantity: e.g. "1.5 hours", "87 Min."
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        with_units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if with_units:
            # The groups appear in the pattern in exactly this order
            days, hours, mins, secs, ms = with_units.groups()
        else:
            decimal = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not decimal:
                return None
            hours, mins = decimal.groups()

    # Sum the components that are present, smallest unit first.
    # The fractional part may have been written with ":" in clock notation
    total = 0
    for raw, to_seconds in (
            (secs, lambda v: float(v)),
            (mins, lambda v: float(v) * 60),
            (hours, lambda v: float(v) * 60 * 60),
            (days, lambda v: float(v) * 24 * 60 * 60),
            (ms, lambda v: float(v.replace(':', '.')))):
        if raw:
            total += to_seconds(raw)
    return total
2631
2632
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of the existing extension of filename.

    If expected_real_ext is given and the current extension differs from it,
    ext is appended after the whole filename instead """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
2639
2640
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the extension of filename with ext.

    If expected_real_ext is given and the current extension differs from it,
    ext is appended after the whole filename instead """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
2646
2647
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started """
    try:
        # Only whether the process can be launched matters; its output is discarded
        process = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate_or_kill()
    except OSError:
        return False
    else:
        return exe
2656
2657
def _get_exe_version_output(exe, args):
    """ Run exe with args and return its combined stdout/stderr as a string,
    or False if the executable could not be started """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        process = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate_or_kill()
    except OSError:
        return False
    # Raw bytes are decoded leniently; undecodable bytes are dropped
    if not isinstance(output, bytes):
        return output
    return output.decode('ascii', 'ignore')
2671
2672
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version string from the output of an executable.

    version_re must contain one capturing group; by default the token following
    the word "version" is used. Returns unrecognized if nothing matches """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    if not mobj:
        return unrecognized
    return mobj.group(1)
2682
2683
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    output = _get_exe_version_output(exe, args)
    if not output:
        # Could not be run, or printed nothing at all
        return False
    return detect_exe_version(output, version_re, unrecognized)
2690
2691
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the wrapped iterable only when they are needed and
    are kept in an internal cache list. Operations that need the end of the
    sequence (len(), negative indices, open-ended slices, reversed iteration,
    repr()/str()) consume the whole iterable, so they must not be used with
    infinite iterables.'''

    class IndexError(IndexError):
        # Raised by __getitem__ in place of the builtin IndexError.
        # It subclasses the builtin, so `except IndexError` still catches it
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        ''' @param iterable  Source of the items
            @param reverse   If true, present the items in reverse order
            @param _cache    List of already-fetched items to use as the cache
                             (passed by __reversed__ and __copy__) '''
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what has already been fetched, then continue pulling new
        # items, caching each one before handing it out
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        ''' Move all remaining items into the cache and return the cache
        (always in the original, non-reversed order) '''
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        # The slice returns a new list, reversed if this LazyList is reversed
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        ''' Map an index of the reversed view to the matching index of the
        cache (0 -> -1, 1 -> -2, -1 -> 0, ...); None stays None '''
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        ''' Return the item at an int index, or a list for a slice.
        Raises LazyList.IndexError if the index is out of range and
        TypeError if idx is neither an int nor a slice '''
        if isinstance(idx, slice):
            if self.__reversed:
                # Translate the slice into cache coordinates
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            # step=0 keeps both "open-ended slice" checks below false for plain indices
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed so that the largest requested
        # position is present in the cache
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probe a single item: index -1 of a reversed list maps to cache
        # index 0, so neither case needs the whole iterable
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known once everything has been fetched
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # The new object is given this object's iterable and cache list,
        # only the direction is flipped
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # The copy is given this object's iterable and cache list
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        # Same text as __repr__
        return repr(self.exhaust())
2780
2781
class PagedList:
    """ Base class for lists whose entries are fetched one page at a time.

    pagefunc(pagenum) must return an iterable with the entries of that page.
    Subclasses implement _getslice(start, end) """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """ Return the entries of page pagenum as a list, fetching them with
        pagefunc unless they are already cached """
        cached = self._cache.get(pagenum)
        page_results = list(self._pagefunc(pagenum)) if cached is None else cached
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """ Return the entries in [start, end) as a list """
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2819
2820
class OnDemandPagedList(PagedList):
    """ PagedList for which the number of pages is not known beforehand:
    pages are requested one after another until a page comes back short """

    def _getslice(self, start, end):
        pagesize = self._pagesize
        for pagenum in itertools.count(start // pagesize):
            # Entry indices covered by this page: [page_first, page_end)
            page_first = pagenum * pagesize
            page_end = page_first + pagesize
            if start >= page_end:
                continue

            # Offsets within this page of the part that was asked for
            lo = start % pagesize if page_first <= start < page_end else 0
            hi = None
            if end is not None and page_first <= end <= page_end:
                hi = ((end - 1) % pagesize) + 1

            page_results = self.getpage(pagenum)
            if lo != 0 or hi is not None:
                page_results = page_results[lo:hi]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_is_short = len(page_results) + lo < pagesize
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if page_is_short or end == page_end:
                break
2854
2855
class InAdvancePagedList(PagedList):
    """ PagedList for which the total number of pages (pagecount) is known
    beforehand. Page caching is always enabled """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        super().__init__(pagefunc, pagesize, True)

    def _getslice(self, start, end):
        pagesize = self._pagesize
        first_page = start // pagesize
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // pagesize + 1)
            remaining = end - start
        # Entries to drop from the front of the first page
        to_skip = start - first_page * pagesize

        for pagenum in range(first_page, last_page):
            page_results = self.getpage(pagenum)
            if to_skip:
                page_results = page_results[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    # This page completes the requested range
                    yield from page_results[:remaining]
                    return
                remaining -= len(page_results)
            yield from page_results
2878
2879
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (8 hex digits) in s with the
    character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        # The decoder returns (text, consumed_length)
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
2886
2887
def lowercase_escape(s):
    """ Replace every \\uXXXX escape sequence (4 hex digits) in s with the
    character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        # The decoder returns (text, consumed_length)
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
2894
2895
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    running_py2 = sys.version_info < (3, 0)
    if running_py2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Reserved characters and "%" are left alone so that text which is
    # already escaped is not escaped a second time
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2901
2902
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Host names are converted with IDNA; every other component is percent-escaped
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2913
2914
def parse_qs(url):
    """ Return the query string of url parsed into a dict mapping each
    parameter name to the list of its values """
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2917
2918
def read_batch_urls(batch_fd):
    """ Read URLs, one per line, from the file-like object batch_fd and close it.

    Blank lines and lines starting with "#", ";" or "]" are skipped, and
    a trailing comment introduced by whitespace followed by "#" is removed.
    Returns the list of remaining URLs """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
    COMMENT_PREFIXES = ('#', ';', ']')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(COMMENT_PREFIXES):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        without_comment = re.split(r'\s#', url, maxsplit=1)[0]
        return without_comment.rstrip()

    with contextlib.closing(batch_fd) as fd:
        cleaned = (fixup(line) for line in fd)
        return [url for url in cleaned if url]
2936
2937
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments (same signature as urlencode) and return the
    result as ASCII bytes """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2940
2941
def update_url_query(url, query):
    """ Return url with the parameters of the dict query merged into its query
    string; parameters already present in url are overwritten.
    url is returned unchanged if query is empty """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    # doseq=True: a list value produces one key=value pair per element
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2950
2951
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request based on req.

    url and data replace the ones of req when given, headers are merged over
    the headers of req and query is merged into the URL. The request method
    (HEAD, PUT or the default) and a timeout attribute, if any, are kept """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    # Keep the request class that corresponds to the original method
    request_classes = {'HEAD': HEADRequest, 'PUT': PUTRequest}
    req_type = request_classes.get(req.get_method(), compat_urllib_request.Request)

    new_req = req_type(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2970
2971
2972 def _multipart_encode_impl(data, boundary):
2973     content_type = 'multipart/form-data; boundary=%s' % boundary
2974
2975     out = b''
2976     for k, v in data.items():
2977         out += b'--' + boundary.encode('ascii') + b'\r\n'
2978         if isinstance(k, compat_str):
2979             k = k.encode('utf-8')
2980         if isinstance(v, compat_str):
2981             v = v.encode('utf-8')
2982         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2983         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2984         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2985         if boundary.encode('ascii') in content:
2986             raise ValueError('Boundary overlaps with data')
2987         out += content
2988
2989     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2990
2991     return out, content_type
2992
2993
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded_body, content_type).
    Raises ValueError if a specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A boundary chosen by the caller is never replaced; errors propagate
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # Try again with another random boundary
            continue
3022
3023
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Look up a key, or the first usable one of several keys, in the dict d.

    With a list or tuple of keys, the value of the first key that is present
    and not None (and, if skip_false_values, not falsy) is returned.
    A single key is looked up with a plain d.get().
    Returns default if nothing suitable is found """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
3032
3033
def try_get(src, getter, expected_type=None):
    """ Apply getter (one callable or several) to src and return the first
    result obtained without a lookup error that is an instance of
    expected_type (any result is accepted if expected_type is None).
    Returns None if no getter succeeds """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
3043
3044
def merge_dicts(*dicts):
    """ Merge dicts into a new dict, earlier dicts taking precedence.

    None values are ignored. A value that is already set is only replaced
    when it is an empty string and the new value is a non-empty string """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
                continue
            current = merged[key]
            fills_empty_string = (
                isinstance(value, compat_str) and value
                and isinstance(current, compat_str) and not current)
            if fills_empty_string:
                merged[key] = value
    return merged
3057
3058
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Return string as text: text is passed through unchanged, anything else
    is decoded using encoding and errors """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
3061
3062
# Age limit returned by parse_age_limit() for each US film rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit returned by parse_age_limit() for each "TV-*" rating label.
# parse_age_limit() also accepts "TV_" or no separator after "TV"
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3080
3081
def parse_age_limit(s):
    """ Convert an age rating to a numeric age limit.

    Accepts an int between 0 and 21, a string of one or two digits with an
    optional trailing "+", a key of US_RATINGS, or a TV parental guideline
    such as "TV-14" (case-insensitive). Returns None for anything else """
    if type(s) is int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Match on the part of each guideline name that follows "TV-"
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3097
3098
def strip_jsonp(code):
    """ Remove a JSONP wrapper such as `callback(...);` or
    `window.cb && window.cb(...)` from code and return the wrapped data.
    code is returned unchanged if it is not wrapped this way """
    jsonp_re = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_re.sub(r'\g<callback_data>', code)
3107
3108
def js_to_json(code, vars={}):
    """ Rewrite JavaScript object-literal style code so that it can be parsed as JSON.

    Tokens found by the regular expression at the end of the function are
    passed to fix_kv() one by one; all other text is left untouched.
    Among other things, comments, "!" runs and trailing commas are dropped,
    strings are re-quoted with double quotes, bare identifiers are quoted,
    undefined / void 0 become null and hex / octal integers become decimal.

    @param code  JavaScript source text
    @param vars  Dict mapping identifiers to the text to insert in their place
    @returns     The converted text
    """
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for integer literals, optionally followed by the
    # ":" of an object key: hexadecimal first, then zero-prefixed octal
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Return the JSON replacement for one matched token
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and trailing commas are removed
            return ""

        if v[0] in ("'", '"'):
            # String literal: strip the quotes and fix the escapes that differ
            # between JS and JSON (bare ", \', line continuation, \xNN)
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Integer object keys must be quoted in JSON
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                # The substitution is inserted as-is, without quoting
                return vars[v]

        # Everything else (string contents, identifiers, decimal keys) is
        # emitted as a double-quoted string
        return '"%s"' % v

    # new Date("...") is reduced to its string argument
    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    # Alternatives, in order: double-quoted string, single-quoted string,
    # comment or trailing comma, void 0 or identifier, hex/octal integer
    # (optionally as a key), decimal integer key, run of "!"
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3157
3158
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def quality_of(qid):
        # The position in quality_ids is the quality; unknown ids rank lowest
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return quality_of
3167
3168
# Set of stage names; presumably the points at which a postprocessor
# can be scheduled to run - TODO confirm against the users of this set
POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}


# Built-in output templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types. The values look like the file name suffix
# used for that kind of file (None where there is none) - verify against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# This is a template for a regex: {0} is to be filled with the pattern for
# the mapping key and {1} with the pattern for the conversion type.
# The "prefix" group captures any escaped "%%" directly before the field
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3208
3209
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s unchanged if it is None or at most length characters long.
    Otherwise s is cut so that the result, including the trailing "...",
    is exactly length characters long. If length is too small to hold the
    ellipses, s is simply cut to length characters """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # `length - len(ELLIPSES)` would be negative here and slice from the
        # end of s, producing a result longer than length
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
3218
3219
def version_tuple(v):
    """ Split the version string v on "." and "-" and return its parts as a
    tuple of ints. Raises ValueError if a part is not an integer """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
3222
3223
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    If version is empty or one of the two cannot be parsed, the answer is
    decided by assume_new: such versions count as outdated only if it is false """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        outdated = fallback
    return outdated
3231
3232
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported inside the function rather than at module level
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3239
3240
def args_to_str(args):
    """ Return the command args as a single string with each argument
    quoted for the shell where necessary """
    # Get a short string representation for a subprocess command
    quoted = map(compat_shlex_quote, args)
    return ' '.join(quoted)
3244
3245
def error_to_compat_str(err):
    """ Return the message of the exception err as text """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
3253
3254
def mimetype2ext(mt):
    """ Return the file extension (without the dot) for the MIME type mt.

    Parameters following a ";" are ignored and the lookup is case-insensitive.
    The full type is tried first, then the subtype, then the structured
    syntax suffix after "+". As a last resort the subtype itself is returned
    with every "+" replaced by ".".
    Returns None if mt is None """
    if mt is None:
        return None

    # Type and subtype names are case-insensitive (RFC 2045, section 5.1),
    # so normalize once and use the result for every lookup below
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3317
3318
def ext2mimetype(ext_or_url):
    """ Guess the MIME type from a file extension, file name or URL.
    Returns None if the argument is empty or the type cannot be guessed """
    if not ext_or_url:
        return None
    # A value without a dot is taken to be a bare extension
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    guessed_type, _ = mimetypes.guess_type(target)
    return guessed_type
3325
3326
def parse_codecs(codecs_str):
    """ Split a comma-separated codecs string into video, audio and subtitle codecs.

    Returns a dict with the keys vcodec, acodec ('none' if absent),
    dynamic_range and, if a subtitle codec was found, tcodec.
    If no codec is recognized but there are exactly two entries, they are
    taken to be the video and the audio codec, in that order.
    Returns {} if nothing can be determined """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    TEXT_CODECS = ('stpp', 'wvtt',)

    stripped = (entry.strip() for entry in codecs_str.strip().strip(',').split(','))
    split_codecs = [entry for entry in stripped if entry]

    vcodec = acodec = tcodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped from the name so that e.g. "av01" matches "av1"
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec is used
                continue
            vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif codec in TEXT_CODECS:
            if not tcodec:
                tcodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec or tcodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if tcodec is not None:
            result['tcodec'] = tcodec
        return result
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3368
3369
def urlhandle_detect_ext(url_handle):
    """ Determine the file extension for the response url_handle.

    The file name of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
3382
3383
def encode_data_uri(data, mime_type):
    """ Return a base64 "data:" URI holding the bytes data with the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3386
3387
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Without a limit set, or with content available for everyone,
    # nothing is blocked
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3396
3397
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.
    Returns a match object if the decoded text starts with optional
    whitespace followed by "<", otherwise None """

    # The UTF-32 LE mark must be tested before the UTF-16 LE mark,
    # which is a prefix of it
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    detected = next((pair for pair in BOMS if first_bytes.startswith(pair[0])), None)
    if detected is None:
        text = first_bytes.decode('utf-8', 'replace')
    else:
        bom, encoding = detected
        text = first_bytes[len(bom):].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
3416
3417
def determine_protocol(info_dict):
    """Return the protocol to use for `info_dict`.

    An explicit 'protocol' entry is returned as-is. Otherwise the sanitized
    'url' is inspected: an rtmp/mms/rtsp prefix or an m3u8/f4m extension
    names the protocol, and failing that the URL scheme is returned.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
3438
3439
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned.

    When `delim` is set, a row made of that string repeated to each column's
    width is inserted below the header. `extra_gap` widens the space between
    columns. With `hide_empty`, columns whose data cells all have zero width
    are dropped from the header and the data.
    """
    def visible_len(text):
        # Tabs are alignment markers only and do not count towards the width
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Columns beyond the end of keep_flags are kept (fillvalue=True)
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    table = [header_row] + data
    max_lens = column_widths(table)
    gap = extra_gap + 1
    if delim:
        rule = [delim * (col_len + gap) for col_len in max_lens]
        rule[-1] = rule[-1][:-gap]  # Remove the gap from the end of the delimiter
        table = [header_row, rule] + data

    lines = []
    for row in table:
        cells = []
        for pos, cell in enumerate(row):
            text = str(cell)
            padding = max_lens[pos] - visible_len(text)
            if '\t' in text:
                # The tab expands to the padding, pushing what follows to the right
                cells.append(text.replace('\t', ' ' * padding) + ' ' * gap)
            else:
                cells.append(text + ' ' * (padding + gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3470
3471
3472 def _match_one(filter_part, dct, incomplete):
3473     # TODO: Generalize code with YoutubeDL._build_format_filter
3474     STRING_OPERATORS = {
3475         '*=': operator.contains,
3476         '^=': lambda attr, value: attr.startswith(value),
3477         '$=': lambda attr, value: attr.endswith(value),
3478         '~=': lambda attr, value: re.search(value, attr),
3479     }
3480     COMPARISON_OPERATORS = {
3481         **STRING_OPERATORS,
3482         '<=': operator.le,  # "<=" must be defined above "<"
3483         '<': operator.lt,
3484         '>=': operator.ge,
3485         '>': operator.gt,
3486         '=': operator.eq,
3487     }
3488
3489     operator_rex = re.compile(r'''(?x)\s*
3490         (?P<key>[a-z_]+)
3491         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3492         (?:
3493             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3494             (?P<strval>.+?)
3495         )
3496         \s*$
3497         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3498     m = operator_rex.search(filter_part)
3499     if m:
3500         m = m.groupdict()
3501         unnegated_op = COMPARISON_OPERATORS[m['op']]
3502         if m['negation']:
3503             op = lambda attr, value: not unnegated_op(attr, value)
3504         else:
3505             op = unnegated_op
3506         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3507         if m['quote']:
3508             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3509         actual_value = dct.get(m['key'])
3510         numeric_comparison = None
3511         if isinstance(actual_value, compat_numeric_types):
3512             # If the original field is a string and matching comparisonvalue is
3513             # a number we should respect the origin of the original field
3514             # and process comparison value as a string (see
3515             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3516             try:
3517                 numeric_comparison = int(comparison_value)
3518             except ValueError:
3519                 numeric_comparison = parse_filesize(comparison_value)
3520                 if numeric_comparison is None:
3521                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3522                 if numeric_comparison is None:
3523                     numeric_comparison = parse_duration(comparison_value)
3524         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3525             raise ValueError('Operator %s only supports string values!' % m['op'])
3526         if actual_value is None:
3527             return incomplete or m['none_inclusive']
3528         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3529
3530     UNARY_OPERATORS = {
3531         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3532         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3533     }
3534     operator_rex = re.compile(r'''(?x)\s*
3535         (?P<op>%s)\s*(?P<key>[a-z_]+)
3536         \s*$
3537         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3538     m = operator_rex.search(filter_part)
3539     if m:
3540         op = UNARY_OPERATORS[m.group('op')]
3541         actual_value = dct.get(m.group('key'))
3542         if incomplete and actual_value is None:
3543             return True
3544         return op(actual_value)
3545
3546     raise ValueError('Invalid filter part %r' % filter_part)
3547
3548
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False.

    The filter is split into parts on unescaped "&" (write "\\&" for a literal
    ampersand) and every part has to match.
    When incomplete, all conditions pass on missing fields.
    """
    for raw_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(raw_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3556
3557
def match_filter_func(filter_str):
    """Wrap `filter_str` in a callable that checks info dicts against it.

    The returned function gives None when the info dict passes match_str(),
    and otherwise a message explaining that the video is being skipped.
    """
    def _match_func(info_dict, *args, **kwargs):
        if not match_str(filter_str, info_dict, *args, **kwargs):
            # Name the video by its title, else its id, else a generic word
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
3566
3567
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float).

    Accepts a plain offset such as "12", "1.5" or "1.5s", and clock values of
    the form "H:MM:SS" whose seconds may carry a fraction introduced by "."
    or ":". Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ":" before the fractional part is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
3579
3580
def srt_subtitles_timecode(seconds):
    """Format `seconds` as an SRT timecode of the form HH:MM:SS,mmm."""
    time_parts = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time_parts
3583
3584
def ass_subtitles_timecode(seconds):
    """Format `seconds` as an ASS timecode of the form H:MM:SS.cc."""
    time_parts = timetuple_from_msec(seconds * 1000)
    # ASS uses hundredths of a second; %d drops the fractional part
    centiseconds = time_parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*time_parts[:-1], centiseconds)
3588
3589
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Every <p> element that has a begin time and either an end time or a
    duration becomes one SRT cue. Cues are numbered by the position of the
    paragraph, so skipped paragraphs leave gaps in the numbering. The
    supported styling properties are rendered as <b>, <i>, <u> and <font>
    tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # (current namespace, [legacy namespaces that are rewritten to it])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # The tts:* attributes that are honoured; any other styling is ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Resolves prefixed names/paths through xpath_with_ns with this prefix map
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, including inherited properties
    default_style = {}  # properties taken from the styles of <body> and <div>

    class TTMLPElementParser(object):
        """XMLParser target that renders one <p> element as SRT text.

        NOTE(review): _unclosed_elements and _applied_styles are class-level
        lists that are only ever mutated in place, so every parser created
        during one dfxp2srt() call shares them. Entries left behind by one
        paragraph therefore affect the following ones; this is kept as-is
        because the rendered output depends on it.
        """
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: defaults, then the referenced style,
                # then the element's own tts:* attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties the innermost applied style already has
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                # One entry per non-<br> element, popped again in end()
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened for this element, innermost first
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the element and re-parse it through the SRT-rendering target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Rewrite legacy namespace URIs so that a single set of paths matches
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent that is
    # declared later in the document, so passes are repeated until no
    # reference is left unresolved.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    while True:
        missing_parents = set()
        known_before = len(styles)
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    missing_parents.add(parent_style_id)
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not missing_parents:
            break
        if len(styles) == known_before:
            # No style was added in this pass, so another identical pass
            # could never succeed: the missing parents are undefined, have
            # no supported properties, or form a reference cycle. Treat them
            # as empty styles so that the next pass resolves every reference
            # instead of looping forever.
            for parent_style_id in missing_parents:
                styles.setdefault(parent_style_id, {})

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + duration; skip the cue if neither is usable
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3752
3753
def cli_option(params, command_option, param):
    """Build the command-line arguments for an option that takes a value.

    Looks up `param` in the `params` dict. Returns an empty list when the
    value is missing or None, otherwise `[command_option, value]` with the
    value converted to a string.

    Every non-None value is converted, including falsy ones such as 0 or
    False, so the result never contains a non-string argument.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, str(value)]
3759
3760
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line arguments for a boolean option.

    Returns an empty list when `param` is missing or None in `params`.
    Otherwise the bool is rendered as `true_value`/`false_value`, either as a
    separate argument or, when `separator` is given, joined onto
    `command_option` in a single argument.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
3769
3770
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return `[command_option]` if `params[param]` equals `expected_value`, else `[]`."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3774
3775
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick extra command-line arguments out of `argdict`.

    `keys` is a list whose items are a key or a group of keys; the items are
    tried in order and the first one for which `argdict` holds any argument
    list wins (the lists of a group are concatenated). Keys are looked up in
    lower case. `default` is returned when nothing matches or `argdict` is
    None.

    For backward compatibility `argdict` may itself be a list/tuple: it is
    returned unchanged if `use_compat` is set and ignored otherwise.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        return default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        found = [args for args in candidates if args is not None]
        if found:
            return [arg for args in found for arg in args]
    return default
3794
3795
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Build the lookup keys for `main_key`/`exe` and defer to cli_configuration_args().

    The root key is `exe` alone when it equals `main_key` (both lower-cased),
    otherwise "main_key+exe"; each entry of `keys` is appended to it as a
    suffix. Only when the bare root key is among the lookup keys are the
    (main_key, exe) group and 'default' added as fallbacks; otherwise
    `use_compat` is switched off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if not same_name:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3807
3808
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps the two-letter code to the three-letter code
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters of the given code are looked up
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # The first entry in table order wins, so the current code is
        # returned rather than the replaced one listed after it
        candidates = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(candidates, None)
4012
4013
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    # Maps the upper-case two-letter code to the country name
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter ISO 3166 country code to the corresponding full name"""
        # The lookup is case-insensitive; unknown codes give None
        normalized = code.upper()
        return cls._country_map.get(normalized)
4272
4273
class GeoUtils(object):
    """Helpers for producing IPv4 addresses that belong to a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Pick a random IPv4 address from a country's block or from a CIDR block

        @param code_or_block  either a two-character country code (looked up
                              case-insensitively in _country_ip_map) or a block
                              written as "address/prefix-length"
        @returns              the address in dotted-quad form, or None when the
                              country code has no known block
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_length = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit gives the highest address of the block
        host_mask = 0xffffffff >> int(prefix_length)
        highest = lowest | host_mask
        picked = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', picked)))
4532
4533
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy

    A request selects a proxy through the internal 'Ytdl-request-proxy' header,
    which is removed from the request before it is processed further.
    """

    def __init__(self, proxies=None):
        # Set default handlers before the base class initializer runs
        for scheme in ('http', 'https'):
            default_handler = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, scheme + '_open', default_handler)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named in the request headers takes precedence over the handler's own
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(self, req, proxy, type)
4557
4558
# The interfaces of long_to_bytes and bytes_to_long are adapted from PyCrypto,
# which is released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
# The conversions themselves are done by int.to_bytes and int.from_bytes

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result is the shortest representation of n (no leading zero bytes).
    Zero and negative numbers are returned as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # Left-pad up to the next multiple of blocksize, if one was requested
    remainder = len(s) % blocksize if blocksize > 0 else 0
    if remainder:
        s = (blocksize - remainder) * b'\000' + s
    return s


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty byte string
    yields 0.
    """
    return int.from_bytes(s, 'big')
4608
4609
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: the block to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of the encrypted block

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as a hexadecimal number,
    # i.e. the input is treated as a little-endian integer
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext_number = int(reversed_hex, 16)
    return format(pow(plaintext_number, exponent, modulus), 'x')
4625
4626
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the padding
    consists of at least 8 random non-zero octets.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for the 11 octets of overhead
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding octets must be non-zero: the first zero octet after the
    # [0, 2] header marks the start of the data when the padding is removed
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + list(data)
4640
4641
def encode_base_n(num, n, table=None):
    """
    Encode the integer num as a string of base-n digits

    @param num      integer to encode; a negative number is encoded as '-'
                    followed by the digits of its absolute value
    @param n        the base
    @param table    digit characters to use, indexed by digit value.
                    Defaults to the first n characters of 0-9a-zA-Z
    @returns        the encoded string
    @raises ValueError  if the table has fewer than n digits, or if a
                        non-zero number is requested in a base smaller than 2
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # The digit loop below can only terminate for a base of at least 2
    if n < 2:
        raise ValueError('base %d cannot encode a non-zero number' % n)

    # Floor division never brings a negative number to zero,
    # so encode the magnitude and put the sign back afterwards
    sign = ''
    if num < 0:
        sign, num = '-', -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return sign + ''.join(reversed(digits))
4658
4659
def decode_packed_codes(code):
    """Expand code that was packed with a symbol table

    PACKED_CODES_RE must capture, in this order, the obfuscated code, the
    base, the number of symbols and the '|'-separated symbols. Every word of
    the obfuscated code is the base-n index of the symbol that replaces it;
    an empty symbol means the word stands for itself.
    """
    packed, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix, remaining = int(radix), int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    return re.sub(
        r'\b(\w+)\b', lambda match: replacements[match.group(0)],
        packed)
4676
4677
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher to s

    Every character of s that occurs in alphabet is replaced by the character
    shift positions further along (wrapping around at either end); all other
    characters are kept. With a shift of 0, s is returned unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
4685
4686
def rot47(s):
    """Rotate every character of s that is in the alphabet below by 47 positions"""
    rot47_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rot47_alphabet, 47)
4689
4690
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict

    Values may be bare or enclosed in double quotes; the quotes are removed
    from the result. If a key occurs more than once, its last value wins.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: (raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs
    }
4698
4699
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as an unsigned 32-bit number"""
    if val < 0:
        # Map the negative number to its unsigned 32-bit counterpart first
        val += 0x100000000
    return val >> n
4702
4703
4704 # Based on png2str() written by @gdkchan and improved by @yokrysty
4705 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode a PNG file into rows of unfiltered byte values

    @param png_data  the complete contents of the PNG file (bytes)
    @returns         (width, height, pixels) where pixels is a list with one
                     entry per row, each a flat list of width * 3 integers
    @raises IOError  if the PNG signature or the leading IHDR chunk is missing,
                     or if the file has no IDAT data

    NOTE: only width and height are read from IHDR and every row is handled as
    width * 3 bytes, i.e. presumably only 8-bit RGB, non-interlaced images are
    supported - TODO confirm against the callers
    """
    # Reference: https://www.w3.org/TR/PNG/
    # Everything after the 8 byte signature is a sequence of chunks
    header = png_data[8:]

    # Bytes 4-8 after the signature hold the type of the first chunk
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integers of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Each chunk is: length (4 bytes), type (4 bytes), data (length bytes), CRC (4 bytes)
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # The first chunk was verified to be IHDR above
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # The compressed image data may be spread over several IDAT chunks
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Number of data bytes per row (3 bytes per pixel)
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # idx is a byte offset into the already unfiltered image (row-major)
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Every row of the decompressed data starts with one filter type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so that _get_pixel can read the bytes
        # of the current row that have been reconstructed so far
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # "left" is the same channel of the previous pixel (3 bytes back),
            # "up" is the same byte of the previous row; both default to 0
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Any other filter type (e.g. 0) leaves the byte unchanged
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the byte up and to the left
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use whichever of a, b, c is closest to p (ties resolved in that order)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
4809
4810
def write_xattr(path, key, value):
    """
    Set the extended attribute key of the file at path to value

    The first available backend is used: the python xattr/pyxattr module, NTFS
    Alternate Data Streams (on Windows) or the setfattr/xattr command line tools.

    @param path   file to modify
    @param key    name of the attribute
    @param value  value of the attribute as bytes (it is written in binary mode
                  on Windows and decoded as UTF-8 for the command line tools)
    @raises XAttrUnavailableError  if pyxattr is too old or no backend is found
    @raises XAttrMetadataError     if the chosen backend fails to set the attribute
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Both modules are imported as "xattr"; they are told apart by their API
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE: despite the wording of the message, no fallback happens
                # here; the error propagates to the caller
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No python module is available
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # The stream is addressed as "<path>:<key>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed as a command line argument
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with the tool's stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
4893
4894
def random_birthday(year_field, month_field, day_field):
    """Generate a random date of birth between 1950-01-01 and 1995-12-31 (inclusive)

    Returns a dict that maps the three given field names to the year, month
    and day of the date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return {field: str(part) for field, part in zip(field_names, date_parts)}
4905
4906
# Templates for internet shortcut files, which are plain text files.
# They are filled in with the % operator: all of them take "url",
# and the .desktop template additionally takes "filename".

# Windows internet shortcut (.url)
DOT_URL_LINK_TEMPLATE = (
    '[InternetShortcut]\n'
    'URL=%(url)s\n'
)

# Property list with a single URL entry (.webloc)
DOT_WEBLOC_LINK_TEMPLATE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
    '<plist version="1.0">\n'
    '<dict>\n'
    '\t<key>URL</key>\n'
    '\t<string>%(url)s</string>\n'
    '</dict>\n'
    '</plist>\n'
)

# Desktop entry of type "Link" (.desktop)
DOT_DESKTOP_LINK_TEMPLATE = (
    '[Desktop Entry]\n'
    'Encoding=UTF-8\n'
    'Name=%(filename)s\n'
    'Type=Link\n'
    'URL=%(url)s\n'
    'Icon=text-html\n'
)

# Maps the name of each link format to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4938
4939
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Unicode hostnames are converted to Punycode. An explicit port is kept, except for port 80 of `http` IRIs, which is the default and therefore dropped. IRIs without a hostname (e.g. relative references) get an empty network location.

    Raises a ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no host
    if iri_parts.hostname:
        net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # Port 80 is only the default for http; dropping it for any other scheme
    # (e.g. https) would make the URI point to a different port
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4980
4981     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4982
4983
def to_high_limit_path(path):
    """Return a form of path that is not subject to MAX_PATH on Windows

    On win32 and cygwin the path is made absolute and prefixed with `\\\\?\\`;
    on every other platform it is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    # (A raw string cannot end in a backslash, hence the space and rstrip)
    long_path_prefix = r'\\?\ '.rstrip()
    return long_path_prefix + os.path.abspath(path)
4990
4991
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value looked up in obj with a %-style template

    @param obj       object that is passed to traverse_obj
    @param field     key, or keys, passed to traverse_obj as the path(s)
    @param template  %-format string that receives the value
    @param ignore    values for which default is returned instead
    @param default   result for ignored values
    @param func      if given, applied to the value before it is formatted
    """
    value = traverse_obj(obj, *variadic(field))
    if value in ignore:
        return default
    if func:
        value = func(value)
    return template % value
4997
4998
def clean_podcast_url(url):
    """Remove the prefixes of known podcast tracking/analytics services from url"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    # Every occurrence is removed, so chained prefixes are all stripped
    return re.sub(tracking_prefix_re, '', url)
5014
5015
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random version 4 UUID in its canonical lowercase text form

    In the template, every "x" is replaced by a random hex digit and "y" by
    one of 8, 9, a or b, so that the two variant bits are set to 10 as
    required for a version 4 UUID.
    """
    def _random_digit(mobj):
        value = random.randint(0, 15)
        if mobj.group(0) == 'y':
            # Force the two most significant bits to 10 (the UUID variant)
            value = (value & 0x3) | 0x8
        return _HEX_TABLE[value]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5021
5022
def make_dir(path, to_screen=None):
    """Create the parent directory of path (and any missing ancestors)

    @param path       path of a file whose directory must exist
    @param to_screen  optional callable that receives an error message
                      if the directory cannot be created
    @returns          True if the directory exists or was created (or path has
                      no directory part), False if creating it failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Only report the error when a reporter was actually given
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5033
5034
def get_executable_path():
    """Return the absolute path of the directory the program is run from

    This is the directory of the executable for PyInstaller builds, and
    otherwise a directory relative to this file: two levels up when running
    from a ZIP, one level up when running from source.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        location = os.path.dirname(sys.executable)
    else:
        from_zip = isinstance(globals().get('__loader__'), zipimporter)  # Running from ZIP
        levels_up = '../..' if from_zip else '..'
        location = os.path.join(os.path.dirname(__file__), levels_up)
    return os.path.abspath(location)
5044
5045
def load_plugins(name, suffix, namespace):
    """Load plugin classes from ytdlp_plugins/<name>/__init__.py

    The file is looked up below the directory given by get_executable_path()
    and executed as a module called name. Every attribute of the module whose
    name ends with suffix, and is not yet present in namespace, is added to
    namespace.

    @param name       name of the plugin package (also used as module name)
    @param suffix     suffix that the names of the plugin classes end with
    @param namespace  dict that the classes are added to
    @returns          dict of the classes that were added, keyed by name;
                      empty if the plugin file does not exist
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        module_name = plugins_spec.name
        # The module is registered before it is executed, as in the importlib recipe
        shadowed = sys.modules.get(module_name)
        sys.modules[module_name] = plugins
        try:
            plugins_spec.loader.exec_module(plugins)
        except BaseException:
            # Do not leave a module behind that was never (fully) executed
            if shadowed is None:
                sys.modules.pop(module_name, None)
            else:
                sys.modules[module_name] = shadowed
            raise
        for attr_name in dir(plugins):
            if attr_name in namespace:
                continue
            if not attr_name.endswith(suffix):
                continue
            klass = getattr(plugins, attr_name)
            classes[attr_name] = namespace[attr_name] = klass
    except FileNotFoundError:
        # No plugins are installed
        pass
    return classes
5064
5065
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The result of the first path that yields a value which is
                            not None after applying expected_type, otherwise the default.
                            For paths that branch (tuple, "..." or function keys) the
                            result is a list, or its first item if get_all is False
    # TODO: Write tests
    '''
    if not casesense:
        # Lowercase all string keys of the paths (lazily); dict keys are
        # lowercased for comparison during the lookup below
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # "depth" belongs to the enclosing function and records the deepest
        # level of branching reached for the current path
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # Stop at a None key or a None object and return what we have
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse each alternative key, then continue with the
                # results as if "..." had been given
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # Branch: apply the rest of the path to every value
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Branch: apply the rest of the path to every value whose
                # key (dict) or index (sequence) is accepted by the function
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # NOTE(review): obj is a plain str here, so unpacking each
                    # character into "k, v" below looks like it would fail - confirm
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dict lookup; if that cannot match and casesense is False,
                # use the first item whose lowercased key equals the key
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                # Index or slice into a sequence (or a string, if allowed)
                if is_user_input:
                    # User input is text: "n" becomes an int, "a:b:c" a slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    # A bare ":" selects everything, which is the same as "..."
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalize expected_type into a function that returns the accepted
    # value, or None if the value is rejected
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Every level of branching nested the results one list deeper;
                # flatten all but the outermost level, dropping None entries
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5163
5164
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj.

    Writes a deprecation notice through write_string and then forwards to
    traverse_obj with is_user_input and traverse_string both enabled.
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5169
5170
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` itself when it is a usable iterable, else wrap it in a 1-tuple.

    @param x              Any object
    @param allowed_types  Types that are treated as single values (and hence
                          wrapped) even though they are iterable
    """
    if isinstance(x, allowed_types):
        return (x,)
    if isinstance(x, collections.abc.Iterable):
        return x
    return (x,)
5173
5174
5175 # create a JSON Web Signature (jws) with HS256 algorithm
5176 # the resulting format is in JWS Compact Serialization
5177 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5178 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Build an HS256-signed token laid out as '<header>.<payload>.<signature>'.

    @param payload_data  JSON-serializable object used as the token payload
    @param key           Signing secret as str; it is UTF-8 encoded before use
    @param headers       Optional dict merged over the default header
                         {'alg': 'HS256', 'typ': 'JWT'}
    @returns             The token as bytes

    NOTE: every segment is encoded with base64.b64encode, i.e. the standard
    alphabet including '=' padding, rather than the unpadded base64url form
    described by RFC 7515. This is kept as-is so that generated tokens do not
    change for existing callers.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    # The signature covers exactly the bytes that precede it in the token
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return signing_input + b'.' + signature_b64
5192
5193
5194 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the JSON-decoded payload of a compact-serialized JWT.

    Only the payload segment is read; the header is not inspected and the
    signature is NOT verified.

    @param jwt  Token as str, made of exactly three '.'-separated segments
    @returns    The object decoded from the payload's JSON
    @raises     ValueError if the token does not have exactly three segments
                or the payload is not valid base64/JSON
    """
    _, payload_b64, _ = jwt.split('.')
    # Tokens usually have the base64 padding stripped, but urlsafe_b64decode
    # rejects input whose length is not a multiple of 4, so restore it
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5199
5200
def supports_terminal_sequences(stream):
    """Check the platform preconditions and then report stream.isatty().

    On Windows ('nt') the result is False when WINDOWS_VT_MODE is falsy or
    the Windows version is below 10.0.10586. On other systems it is False
    when the TERM environment variable is unset or empty. Otherwise the
    result of stream.isatty() is returned, or False if that call raises.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE:
            return False
        if get_windows_version() < (10, 0, 10586):
            return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
5212
5213
# ESC, '[', one or more characters other than 'm', then a closing 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of _terminal_sequences_re removed."""
    return re.sub(_terminal_sequences_re, '', string)
5219
5220
def number_of_digits(number):
    """Return the length of `number` rendered with the '%d' format.

    A leading '-' of a negative number is part of the rendering and is
    therefore counted as well.
    """
    rendered = '%d' % number
    return len(rendered)
5223
5224
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy `values`, converted to str, using `delim`.

    @param values     Items to join; falsy ones (None, '', 0, ...) are dropped
    @param delim      Separator placed between the kept items
    @param from_dict  If given, `values` are treated as keys and are first
                      replaced by from_dict.get(key)
    """
    if from_dict is not None:
        values = [from_dict.get(name) for name in values]
    return delim.join(str(item) for item in values if item)
5229
5230
class Config:
    """A set of command-line style arguments plus the configs they pull in.

    Each instance holds its own argument list (own_args) and a list of child
    Config objects (configs) created through append_config. All instances
    created from one root share a single set of already-loaded file paths.
    """
    own_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser  Object providing parse_args(args) and error(msg); the
                       first element of the parse_args result must expose a
                       config_locations attribute
        @param label   Optional text used as a prefix in __str__
        """
        self._parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Store `args` and load every config location they refer to.

        @param args      The arguments owned by this config
        @param filename  File the arguments came from, if any. Its real path
                         is recorded so the same file is not loaded twice, and
                         its directory is the base for relative locations
        @returns         False if `filename` was already loaded (nothing is
                         stored in that case), True otherwise
        """
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            location = os.path.join(directory, expand_path(location))
            # A directory stands for the yt-dlp.conf file inside it
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Describe own args (login info hidden) followed by the child configs.

        The lines of every child config are prefixed with '| '.
        """
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments.

        @param filename  Path of the file to read
        @param default   Returned as-is (not copied) when the file cannot be
                         opened; callers must therefore not mutate the shared
                         list used when this parameter is omitted
        @returns         The arguments produced by compat_shlex_split with
                         comments enabled, or `default`
        """
        try:
            optionf = open(filename)
        except IOError:
            return default  # silently skip if file is not present
        # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
        with optionf:
            return compat_shlex_split(optionf.read(), comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of login options replaced.

        Both the '--option=value' and the '--option value' forms are handled;
        the value becomes the literal 'PRIVATE'. `opts` itself is not modified.
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            return m.group('key') + '=PRIVATE' if m else o

        opts = list(map(_scrub_eq, opts))
        # The list is deliberately modified while being enumerated: a value
        # that was just replaced is then never mistaken for an option itself
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config, init() it with `args` and keep it if loaded.

        The child shares this config's parser and set of loaded paths, so a
        file that was already loaded anywhere in the tree is not added again.
        """
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the args of the child configs (last added first), then own args."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        """Return the parser's result for the combined arguments (see all_args)."""
        return self._parser.parse_args(list(self.all_args))
5318
5319
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes

    Every instance creates its own event loop. The blocking methods run the
    corresponding coroutine on that loop until it completes. The connection
    is opened by __enter__ and closed by __exit__.
    """

    def __init__(self, url, headers=None):
        """
        @param url      Address handed to compat_websockets.connect
        @param headers  Passed to connect as extra_headers
        """
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))

    def __enter__(self):
        self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Run the connection's send(*args) coroutine to completion"""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Run the connection's recv(*args) coroutine and return its result"""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Leftover tasks can only be cancelled while the loop is still
            # usable, so the loop is closed last - even if cancelling fails
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` and return its result.

        Afterwards the loop's async generators are shut down, as is its
        default executor where the loop supports that.

        @raises ValueError if `main` is not a coroutine
        """
        if not asyncio.coroutines.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of `loop` and wait for them to end.

        `loop` must not be closed. Exceptions raised by the tasks, other than
        their cancellation, are reported to the loop's exception handler.
        """
        to_cancel = asyncio.tasks.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() takes the loop from the tasks themselves; its explicit
        # "loop" parameter no longer exists in Python 3.10+
        loop.run_until_complete(
            asyncio.tasks.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5382
5383
# True when compat_websockets is truthy, False otherwise
has_websockets = True if compat_websockets else False