yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import asyncio
   7 import atexit
   8 import base64
   9 import binascii
  10 import calendar
  11 import codecs
  12 import collections
  13 import contextlib
  14 import ctypes
  15 import datetime
  16 import email.utils
  17 import email.header
  18 import errno
  19 import functools
  20 import gzip
  21 import hashlib
  22 import hmac
  23 import importlib.util
  24 import io
  25 import itertools
  26 import json
  27 import locale
  28 import math
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import socket
  35 import ssl
  36 import subprocess
  37 import sys
  38 import tempfile
  39 import time
  40 import traceback
  41 import xml.etree.ElementTree
  42 import zlib
  43 import mimetypes
  44
  45 from .compat import (
  46     compat_HTMLParseError,
  47     compat_HTMLParser,
  48     compat_HTTPError,
  49     compat_basestring,
  50     compat_chr,
  51     compat_cookiejar,
  52     compat_ctypes_WINFUNCTYPE,
  53     compat_etree_fromstring,
  54     compat_expanduser,
  55     compat_html_entities,
  56     compat_html_entities_html5,
  57     compat_http_client,
  58     compat_integer_types,
  59     compat_numeric_types,
  60     compat_kwargs,
  61     compat_os_name,
  62     compat_parse_qs,
  63     compat_shlex_split,
  64     compat_shlex_quote,
  65     compat_str,
  66     compat_struct_pack,
  67     compat_struct_unpack,
  68     compat_urllib_error,
  69     compat_urllib_parse,
  70     compat_urllib_parse_urlencode,
  71     compat_urllib_parse_urlparse,
  72     compat_urllib_parse_urlunparse,
  73     compat_urllib_parse_quote,
  74     compat_urllib_parse_quote_plus,
  75     compat_urllib_parse_unquote_plus,
  76     compat_urllib_request,
  77     compat_urlparse,
  78     compat_websockets,
  79     compat_xpath,
  80 )
  81
  82 from .socks import (
  83     ProxyType,
  84     sockssocket,
  85 )
  86
  87
  88 def register_socks_protocols():
  89     # "Register" SOCKS protocols
  90     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  91     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  92     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  93         if scheme not in compat_urlparse.uses_netloc:
  94             compat_urlparse.uses_netloc.append(scheme)
  95
  96
# The class of compiled regular expression objects, obtained by compiling an
# empty pattern because this is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  99
 100
 101 def random_user_agent():
 102     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 103     _CHROME_VERSIONS = (
 104         '90.0.4430.212',
 105         '90.0.4430.24',
 106         '90.0.4430.70',
 107         '90.0.4430.72',
 108         '90.0.4430.85',
 109         '90.0.4430.93',
 110         '91.0.4472.101',
 111         '91.0.4472.106',
 112         '91.0.4472.114',
 113         '91.0.4472.124',
 114         '91.0.4472.164',
 115         '91.0.4472.19',
 116         '91.0.4472.77',
 117         '92.0.4515.107',
 118         '92.0.4515.115',
 119         '92.0.4515.131',
 120         '92.0.4515.159',
 121         '92.0.4515.43',
 122         '93.0.4556.0',
 123         '93.0.4577.15',
 124         '93.0.4577.63',
 125         '93.0.4577.82',
 126         '94.0.4606.41',
 127         '94.0.4606.54',
 128         '94.0.4606.61',
 129         '94.0.4606.71',
 130         '94.0.4606.81',
 131         '94.0.4606.85',
 132         '95.0.4638.17',
 133         '95.0.4638.50',
 134         '95.0.4638.54',
 135         '95.0.4638.69',
 136         '95.0.4638.74',
 137         '96.0.4664.18',
 138         '96.0.4664.45',
 139         '96.0.4664.55',
 140         '96.0.4664.93',
 141         '97.0.4692.20',
 142     )
 143     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 144
 145
# Default HTTP request headers. random_user_agent() is called once, when this
# module is imported, so the User-Agent stays the same for the whole process.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 153
 154
# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel for "no default value supplied"; it is compared by identity
# so that None stays usable as an explicit default
NO_DEFAULT = object()
 161
# Full English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by two-letter language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 172
# File extensions (without the leading dot) that are treated as known;
# one row per container/codec family
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 187
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# paired positionally by zip(); plain strings in the chain contribute one
# replacement per character, while the one-element lists carry the
# multi-character replacements (e.g. 'Æ' -> 'AE', 'ß' -> 'ss').
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 192
# Date/time format strings written with strptime-style directives
# (presumably tried in order by the date parsing helpers -- confirm at the
# use site). These are the formats that do not depend on whether the day or
# the month comes first in an all-numeric date.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats followed by the numeric formats that put the day before
# the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The common formats followed by the numeric formats that put the month before
# the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 256
# Matches text of the form }('<payload>',<int>,<int>,'<a|b|c>'.split('|')
# and captures the payload, the two integers and the '|'-joined word list
# (presumably the argument list of packed JavaScript -- confirm at the use site)
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type attribute is application/ld+json
# (optionally quoted, case-insensitive, spanning lines) and captures the
# element body in the named group "json_ld"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 259
 260
 261 def preferredencoding():
 262     """Get preferred encoding.
 263
 264     Returns the best encoding scheme for the system, based on
 265     locale.getpreferredencoding() and some further tweaks.
 266     """
 267     try:
 268         pref = locale.getpreferredencoding()
 269         'TEST'.encode(pref)
 270     except Exception:
 271         pref = 'UTF-8'
 272
 273     return pref
 274
 275
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn (same
    directory, name "<basename of fn>.<random>.tmp") and the temporary file
    is then renamed onto fn. If anything fails, the temporary file is removed
    (best effort) and the exception is re-raised.

    @param obj  The object to serialise with json.dump
    @param fn   The destination filename
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # delete=False: the file must survive being closed so it can be renamed
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        # The with-block closes (and thereby flushes) the temporary file
        # before it is renamed
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            # os.umask() can only be read by setting it, so set it to 0 and
            # restore the previous value right away
            mask = os.umask(0)
            os.umask(mask)
            # Give the file the permissions a regular newly created file
            # would get under the current umask
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            # Adjusting the permissions is best effort only
            pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; the original error is
        # re-raised whether or not the clean-up succeeds
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
 334
 335
 336 if sys.version_info >= (2, 7):
 337     def find_xpath_attr(node, xpath, key, val=None):
 338         """ Find the xpath xpath[@key=val] """
 339         assert re.match(r'^[a-zA-Z_-]+$', key)
 340         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 341         return node.find(expr)
 342 else:
 343     def find_xpath_attr(node, xpath, key, val=None):
 344         for f in node.findall(compat_xpath(xpath)):
 345             if key not in f.attrib:
 346                 continue
 347             if val is None or f.attrib.get(key) == val:
 348                 return f
 349         return None
 350
 351 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 352 # the namespace parameter
 353
 354
 355 def xpath_with_ns(path, ns_map):
 356     components = [c.split(':') for c in path.split('/')]
 357     replaced = []
 358     for c in components:
 359         if len(c) == 1:
 360             replaced.append(c[0])
 361         else:
 362             ns, tag = c
 363             replaced.append('{%s}%s' % (ns_map[ns], tag))
 364     return '/'.join(replaced)
 365
 366
 367 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 368     def _find_xpath(xpath):
 369         return node.find(compat_xpath(xpath))
 370
 371     if isinstance(xpath, (str, compat_str)):
 372         n = _find_xpath(xpath)
 373     else:
 374         for xp in xpath:
 375             n = _find_xpath(xp)
 376             if n is not None:
 377                 break
 378
 379     if n is None:
 380         if default is not NO_DEFAULT:
 381             return default
 382         elif fatal:
 383             name = xpath if name is None else name
 384             raise ExtractorError('Could not find XML element %s' % name)
 385         else:
 386             return None
 387     return n
 388
 389
 390 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 391     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 392     if n is None or n == default:
 393         return n
 394     if n.text is None:
 395         if default is not NO_DEFAULT:
 396             return default
 397         elif fatal:
 398             name = xpath if name is None else name
 399             raise ExtractorError('Could not find XML element\'s text %s' % name)
 400         else:
 401             return None
 402     return n.text
 403
 404
 405 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 406     n = find_xpath_attr(node, xpath, key)
 407     if n is None:
 408         if default is not NO_DEFAULT:
 409             return default
 410         elif fatal:
 411             name = '%s[@%s]' % (xpath, key) if name is None else name
 412             raise ExtractorError('Could not find XML attribute %s' % name)
 413         else:
 414             return None
 415     return n.attrib[key]
 416
 417
 418 def get_element_by_id(id, html):
 419     """Return the content of the tag with the specified ID in the passed HTML document"""
 420     return get_element_by_attribute('id', id, html)
 421
 422
 423 def get_element_html_by_id(id, html):
 424     """Return the html of the tag with the specified ID in the passed HTML document"""
 425     return get_element_html_by_attribute('id', id, html)
 426
 427
 428 def get_element_by_class(class_name, html):
 429     """Return the content of the first tag with the specified class in the passed HTML document"""
 430     retval = get_elements_by_class(class_name, html)
 431     return retval[0] if retval else None
 432
 433
 434 def get_element_html_by_class(class_name, html):
 435     """Return the html of the first tag with the specified class in the passed HTML document"""
 436     retval = get_elements_html_by_class(class_name, html)
 437     return retval[0] if retval else None
 438
 439
 440 def get_element_by_attribute(attribute, value, html, escape_value=True):
 441     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 442     return retval[0] if retval else None
 443
 444
 445 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
 446     retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
 447     return retval[0] if retval else None
 448
 449
 450 def get_elements_by_class(class_name, html):
 451     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 452     return get_elements_by_attribute(
 453         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 454         html, escape_value=False)
 455
 456
 457 def get_elements_html_by_class(class_name, html):
 458     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 459     return get_elements_html_by_attribute(
 460         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 461         html, escape_value=False)
 462
 463
 464 def get_elements_by_attribute(*args, **kwargs):
 465     """Return the content of the tag with the specified attribute in the passed HTML document"""
 466     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 467
 468
 469 def get_elements_html_by_attribute(*args, **kwargs):
 470     """Return the html of the tag with the specified attribute in the passed HTML document"""
 471     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 472
 473
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator: it yields one (content, whole) tuple for every
    opening tag in html that carries the attribute with a matching value.

    @param attribute     Name of the attribute (always regex-escaped)
    @param value         Attribute value to look for
    @param html          The HTML document to search
    @param escape_value  If true, value is matched literally; otherwise it is
                         used as a regular expression
    """

    # Quotes around the attribute value are optional in the pattern unless the
    # value starts with whitespace or one of "'`=<>
    # NOTE(review): re.match only inspects the first character of value, so
    # such characters later in the value do not make the quotes mandatory --
    # confirm whether this is intended
    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag from '<' up to and including the wanted
    # attribute; the closing quote must equal the opening one (group _q)
    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element starting at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        # The content has one pair of matching surrounding quotes removed (if
        # present) and its HTML entities decoded; the html is returned as-is
        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 497
 498
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Raised by handle_endtag once the tag stack has been emptied
        pass

    def __init__(self):
        # Names of the currently open tags, innermost last
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        # Returns None, so exceptions raised inside the with-block propagate
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        # The attributes are ignored; only the tag name is tracked
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag is found; tags opened after it
        # are discarded along the way, i.e. treated as implicitly closed
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            # The stack was exhausted without meeting an opening `tag`
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first tag that was opened has just been closed
            raise self.HTMLBreakOnClosingTagException()
 539
 540
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @param tag   Name of the tag to look for
    @param html  The HTML document to search
    @returns     Tuple (content, whole): the text between the opening tag and
                 its matching closing tag, and the element including both tags
    @raises      compat_HTMLParseError if the opening tag, the end of the
                 opening tag, or a matching closing tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given exception when not found
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an index into html, just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag first so that it becomes the bottom of the stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one closing-tag candidate at a time; the parser
        # raises once the candidate closes the outermost element, which lets
        # nested elements with the same tag name be skipped over
        while offset < len(html):
            # Both positions below are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # offset was not advanced for this chunk, so it still marks
                # the start of the chunk containing the final closing tag
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 574
 575
 576 class HTMLAttributeParser(compat_HTMLParser):
 577     """Trivial HTML parser to gather the attributes for a single element"""
 578
 579     def __init__(self):
 580         self.attrs = {}
 581         compat_HTMLParser.__init__(self)
 582
 583     def handle_starttag(self, tag, attrs):
 584         self.attrs = dict(attrs)
 585
 586
 587 class HTMLListAttrsParser(compat_HTMLParser):
 588     """HTML parser to gather the attributes for the elements of a list"""
 589
 590     def __init__(self):
 591         compat_HTMLParser.__init__(self)
 592         self.items = []
 593         self._level = 0
 594
 595     def handle_starttag(self, tag, attrs):
 596         if tag == 'li' and self._level == 0:
 597             self.items.append(dict(attrs))
 598         self._level += 1
 599
 600     def handle_endtag(self, tag):
 601         self._level -= 1
 602
 603
 604 def extract_attributes(html_element):
 605     """Given a string for an HTML element such as
 606     <el
 607          a="foo" B="bar" c="&98;az" d=boz
 608          empty= noval entity="&amp;"
 609          sq='"' dq="'"
 610     >
 611     Decode and return a dictionary of attributes.
 612     {
 613         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 614         'empty': '', 'noval': None, 'entity': '&',
 615         'sq': '"', 'dq': '\''
 616     }.
 617     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 618     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 619     """
 620     parser = HTMLAttributeParser()
 621     try:
 622         parser.feed(html_element)
 623         parser.close()
 624     # Older Python may throw HTMLParseError in case of malformed HTML
 625     except compat_HTMLParseError:
 626         pass
 627     return parser.attrs
 628
 629
 630 def parse_list(webpage):
 631     """Given a string for an series of HTML <li> elements,
 632     return a dictionary of their attributes"""
 633     parser = HTMLListAttrsParser()
 634     parser.feed(webpage)
 635     parser.close()
 636     return parser.items
 637
 638
 639 def clean_html(html):
 640     """Clean an HTML snippet into a readable string"""
 641
 642     if html is None:  # Convenience for sanitizing descriptions etc.
 643         return html
 644
 645     html = re.sub(r'\s+', ' ', html)
 646     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 647     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 648     # Strip html tags
 649     html = re.sub('<.*?>', '', html)
 650     # Replace html entities
 651     html = unescapeHTML(html)
 652     return html.strip()
 653
 654
 655 def sanitize_open(filename, open_mode):
 656     """Try to open the given filename, and slightly tweak it if this fails.
 657
 658     Attempts to open the given filename. If this fails, it tries to change
 659     the filename slightly, step by step, until it's either able to open it
 660     or it fails and raises a final exception, like the standard open()
 661     function.
 662
 663     It returns the tuple (stream, definitive_file_name).
 664     """
 665     try:
 666         if filename == '-':
 667             if sys.platform == 'win32':
 668                 import msvcrt
 669                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 670             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 671         stream = locked_file(filename, open_mode, block=False).open()
 672         return (stream, filename)
 673     except (IOError, OSError) as err:
 674         if err.errno in (errno.EACCES,):
 675             raise
 676
 677         # In case of error, try to remove win32 forbidden chars
 678         alt_filename = sanitize_path(filename)
 679         if alt_filename == filename:
 680             raise
 681         else:
 682             # An exception here should be caught in the caller
 683             stream = locked_file(filename, open_mode, block=False).open()
 684             return (stream, alt_filename)
 685
 686
 687 def timeconvert(timestr):
 688     """Convert RFC 2822 defined time string into system timestamp"""
 689     timestamp = None
 690     timetuple = email.utils.parsedate_tz(timestr)
 691     if timetuple is not None:
 692         timestamp = email.utils.mktime_tz(timetuple)
 693     return timestamp
 694
 695
 696 def sanitize_filename(s, restricted=False, is_id=False):
 697     """Sanitizes a string so it could be used as part of a filename.
 698     If restricted is set, use a stricter subset of allowed characters.
 699     Set is_id if this is not an arbitrary string, but an ID that should be kept
 700     if possible.
 701     """
 702     def replace_insane(char):
 703         if restricted and char in ACCENT_CHARS:
 704             return ACCENT_CHARS[char]
 705         elif not restricted and char == '\n':
 706             return ' '
 707         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 708             return ''
 709         elif char == '"':
 710             return '' if restricted else '\''
 711         elif char == ':':
 712             return '_-' if restricted else ' -'
 713         elif char in '\\/|*<>':
 714             return '_'
 715         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 716             return '_'
 717         if restricted and ord(char) > 127:
 718             return '_'
 719         return char
 720
 721     if s == '':
 722         return ''
 723     # Handle timestamps
 724     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 725     result = ''.join(map(replace_insane, s))
 726     if not is_id:
 727         while '__' in result:
 728             result = result.replace('__', '_')
 729         result = result.strip('_')
 730         # Common case of "Foreign band name - English song title"
 731         if restricted and result.startswith('-_'):
 732             result = result[2:]
 733         if result.startswith('-'):
 734             result = '_' + result[len('-'):]
 735         result = result.lstrip('.')
 736         if not result:
 737             result = '_'
 738     return result
 739
 740
 741 def sanitize_path(s, force=False):
 742     """Sanitizes and normalizes path on Windows"""
 743     if sys.platform == 'win32':
 744         force = False
 745         drive_or_unc, _ = os.path.splitdrive(s)
 746         if sys.version_info < (2, 7) and not drive_or_unc:
 747             drive_or_unc, _ = os.path.splitunc(s)
 748     elif force:
 749         drive_or_unc = ''
 750     else:
 751         return s
 752
 753     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 754     if drive_or_unc:
 755         norm_path.pop(0)
 756     sanitized_path = [
 757         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 758         for path_part in norm_path]
 759     if drive_or_unc:
 760         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 761     elif force and s[0] == os.path.sep:
 762         sanitized_path.insert(0, os.path.sep)
 763     return os.path.join(*sanitized_path)
 764
 765
 766 def sanitize_url(url):
 767     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 768     # the number of unwanted failures due to missing protocol
 769     if url.startswith('//'):
 770         return 'http:%s' % url
 771     # Fix some common typos seen so far
 772     COMMON_TYPOS = (
 773         # https://github.com/ytdl-org/youtube-dl/issues/15649
 774         (r'^httpss://', r'https://'),
 775         # https://bx1.be/lives/direct-tv/
 776         (r'^rmtp([es]?)://', r'rtmp\1://'),
 777     )
 778     for mistake, fixup in COMMON_TYPOS:
 779         if re.match(mistake, url):
 780             return re.sub(mistake, fixup, url)
 781     return url
 782
 783
 784 def extract_basic_auth(url):
 785     parts = compat_urlparse.urlsplit(url)
 786     if parts.username is None:
 787         return url, None
 788     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 789         parts.hostname if parts.port is None
 790         else '%s:%d' % (parts.hostname, parts.port))))
 791     auth_payload = base64.b64encode(
 792         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 793     return url, 'Basic ' + auth_payload.decode('utf-8')
 794
 795
 796 def sanitized_Request(url, *args, **kwargs):
 797     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 798     if auth_header is not None:
 799         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 800         headers['Authorization'] = auth_header
 801     return compat_urllib_request.Request(url, *args, **kwargs)
 802
 803
 804 def expand_path(s):
 805     """Expand shell variables and ~"""
 806     return os.path.expandvars(compat_expanduser(s))
 807
 808
 809 def orderedSet(iterable):
 810     """ Remove all duplicates from the input iterable """
 811     res = []
 812     for el in iterable:
 813         if el not in res:
 814             res.append(el)
 815     return res
 816
 817
 818 def _htmlentity_transform(entity_with_semicolon):
 819     """Transforms an HTML entity to a character."""
 820     entity = entity_with_semicolon[:-1]
 821
 822     # Known non-numeric HTML entity
 823     if entity in compat_html_entities.name2codepoint:
 824         return compat_chr(compat_html_entities.name2codepoint[entity])
 825
 826     # TODO: HTML5 allows entities without a semicolon. For example,
 827     # '&Eacuteric' should be decoded as 'Éric'.
 828     if entity_with_semicolon in compat_html_entities_html5:
 829         return compat_html_entities_html5[entity_with_semicolon]
 830
 831     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 832     if mobj is not None:
 833         numstr = mobj.group(1)
 834         if numstr.startswith('x'):
 835             base = 16
 836             numstr = '0%s' % numstr
 837         else:
 838             base = 10
 839         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 840         try:
 841             return compat_chr(int(numstr, base))
 842         except ValueError:
 843             pass
 844
 845     # Unknown entity in name, return its literal representation
 846     return '&%s;' % entity
 847
 848
 849 def unescapeHTML(s):
 850     if s is None:
 851         return None
 852     assert type(s) == compat_str
 853
 854     return re.sub(
 855         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 856
 857
 858 def escapeHTML(text):
 859     return (
 860         text
 861         .replace('&', '&amp;')
 862         .replace('<', '&lt;')
 863         .replace('>', '&gt;')
 864         .replace('"', '&quot;')
 865         .replace("'", '&#39;')
 866     )
 867
 868
 869 def process_communicate_or_kill(p, *args, **kwargs):
 870     try:
 871         return p.communicate(*args, **kwargs)
 872     except BaseException:  # Including KeyboardInterrupt
 873         p.kill()
 874         p.wait()
 875         raise
 876
 877
 878 class Popen(subprocess.Popen):
 879     if sys.platform == 'win32':
 880         _startupinfo = subprocess.STARTUPINFO()
 881         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 882     else:
 883         _startupinfo = None
 884
 885     def __init__(self, *args, **kwargs):
 886         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 887
 888     def communicate_or_kill(self, *args, **kwargs):
 889         return process_communicate_or_kill(self, *args, **kwargs)
 890
 891
 892 def get_subprocess_encoding():
 893     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 894         # For subprocess calls, encode with locale encoding
 895         # Refer to http://stackoverflow.com/a/9951851/35070
 896         encoding = preferredencoding()
 897     else:
 898         encoding = sys.getfilesystemencoding()
 899     if encoding is None:
 900         encoding = 'utf-8'
 901     return encoding
 902
 903
 904 def encodeFilename(s, for_subprocess=False):
 905     """
 906     @param s The name of the file
 907     """
 908
 909     assert type(s) == compat_str
 910
 911     # Python 3 has a Unicode API
 912     if sys.version_info >= (3, 0):
 913         return s
 914
 915     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 916     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 917     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 918     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 919         return s
 920
 921     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 922     if sys.platform.startswith('java'):
 923         return s
 924
 925     return s.encode(get_subprocess_encoding(), 'ignore')
 926
 927
 928 def decodeFilename(b, for_subprocess=False):
 929
 930     if sys.version_info >= (3, 0):
 931         return b
 932
 933     if not isinstance(b, bytes):
 934         return b
 935
 936     return b.decode(get_subprocess_encoding(), 'ignore')
 937
 938
 939 def encodeArgument(s):
 940     if not isinstance(s, compat_str):
 941         # Legacy code that uses byte strings
 942         # Uncomment the following line after fixing all post processors
 943         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 944         s = s.decode('ascii')
 945     return encodeFilename(s, True)
 946
 947
 948 def decodeArgument(b):
 949     return decodeFilename(b, True)
 950
 951
 952 def decodeOption(optval):
 953     if optval is None:
 954         return optval
 955     if isinstance(optval, bytes):
 956         optval = optval.decode(preferredencoding())
 957
 958     assert isinstance(optval, compat_str)
 959     return optval
 960
 961
 962 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 963
 964
 965 def timetuple_from_msec(msec):
 966     secs, msec = divmod(msec, 1000)
 967     mins, secs = divmod(secs, 60)
 968     hrs, mins = divmod(mins, 60)
 969     return _timetuple(hrs, mins, secs, msec)
 970
 971
 972 def formatSeconds(secs, delim=':', msec=False):
 973     time = timetuple_from_msec(secs * 1000)
 974     if time.hours:
 975         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 976     elif time.minutes:
 977         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 978     else:
 979         ret = '%d' % time.seconds
 980     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 981
 982
 983 def _ssl_load_windows_store_certs(ssl_context, storename):
 984     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 985     try:
 986         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 987                  if encoding == 'x509_asn' and (
 988                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 989     except PermissionError:
 990         return
 991     for cert in certs:
 992         try:
 993             ssl_context.load_verify_locations(cadata=cert)
 994         except ssl.SSLError:
 995             pass
 996
 997
 998 def make_HTTPS_handler(params, **kwargs):
 999     opts_check_certificate = not params.get('nocheckcertificate')
1000     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1001     context.check_hostname = opts_check_certificate
1002     if params.get('legacyserverconnect'):
1003         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
1004     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1005     if opts_check_certificate:
1006         try:
1007             context.load_default_certs()
1008             # Work around the issue in load_default_certs when there are bad certificates. See:
1009             # https://github.com/yt-dlp/yt-dlp/issues/1060,
1010             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1011         except ssl.SSLError:
1012             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1013             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1014                 # Create a new context to discard any certificates that were already loaded
1015                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1016                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
1017                 for storename in ('CA', 'ROOT'):
1018                     _ssl_load_windows_store_certs(context, storename)
1019             context.set_default_verify_paths()
1020     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1021
1022
def bug_reports_message(before=';'):
    """Return the "please report this issue" text, prefixed by `before`.

    Trailing whitespace of `before` is dropped. The message starts with a
    capital letter when `before` is empty or ends a sentence ('.', '!', '?').
    """
    msg = ('please report this issue on  https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    if not prefix:
        return msg
    return prefix + ' ' + msg
1033
1034
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # No explicit message: keep the class-level one, or fall back
            # to the name of the exception class
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1045
1046
# Exception types treated as network errors (see ExtractorError)
network_exceptions = (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error)
# ssl.CertificateError is only included when the ssl module provides it
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1051
1052
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        The final message is built from ie, video_id, msg and cause; the bug
        report hint is appended unless the error is expected.
        """
        current_exc = sys.exc_info()
        # An error raised while handling a network exception is never a bug
        if current_exc[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = current_exc  # preserve original exception

        message_parts = (
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message(),
        )
        super().__init__(''.join(message_parts))

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
1082
1083
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1089
1090
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1094
1095
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    The `countries` argument is stored unchanged on the instance.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # The error is always reported as expected, whatever the caller passed
        super().__init__(msg, **dict(kwargs, expected=True))
        self.countries = countries
1107
1108
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
1121
1122
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry is not found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
1130
1131
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects if they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # Append the offending file name to the class-level message, if known
        if filename is not None:
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1144
1145
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    May be raised by a PostProcessor's .run() method to signal that the
    postprocessing task failed.
    """
1152
1153
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
1157
1158
class ExistingVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-existing."""
    msg = (
        'Encountered a video that is already in the archive, '
        'stopping due to --break-on-existing')
1162
1163
class RejectedVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-reject."""
    msg = (
        'Encountered a video that did not match filter, '
        'stopping due to --break-on-reject')
1167
1168
class MaxDownloadsReached(DownloadCancelled):
    """Cancellation triggered when the --max-downloads limit has been reached."""
    msg = (
        'Maximum number of downloads reached, '
        'stopping due to --max-downloads')
1172
1173
class ReExtractInfo(YoutubeDLError):
    """Video info needs to be re-extracted.

    `expected` is stored on the instance unchanged.
    """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1180
1181
class ThrottledDownload(ReExtractInfo):
    """Download speed below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always raised with the fixed class message and expected=False
        super().__init__(msg=self.msg, expected=False)
1188
1189
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error to the class-level message, if given
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1202
1203
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    May be raised by FileDownloader objects when a downloaded file is
    smaller than what the server announced first, indicating that the
    connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1219
1220
class XAttrMetadataError(YoutubeDLError):
    """Extended-attribute error carrying an errno `code`, a `msg` and a derived `reason`.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify the failure from the errno code or, failing that, from
        # well-known fragments of the error message
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg or 'Disk quota exceeded' in msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1235
1236
class XAttrUnavailableError(YoutubeDLError):
    """Extended-attribute error (presumably: xattr support is unavailable - confirm against callers)."""
1239
1240
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and, if configured, bind it to a source address.

    ydl_handler -- handler whose `_params` dict is consulted for 'source_address'
    http_class  -- connection class to instantiate with *args/**kwargs
    is_https    -- only used by the Python 2.6 fallback, to decide whether the
                   socket has to be wrapped with SSL

    Returns the connection object. When `_params['source_address']` is set,
    the connection is patched so that only remote addresses of the same
    IP family as the source address are tried.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: the `source_address` parameter shadows the outer variable;
            # here it is the (ip, port) tuple handed over by the connection object
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source IP means IPv4, anything else is treated as IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try every matching address in turn; the first successful connect wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and move on to the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # No address worked: re-raise the last failure, if there was one
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        # Only patch connection objects that expose the _create_connection hook
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 in the bind address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() with a version
            # that goes through _create_connection itself
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1304
1305
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker to `headers`.

    Without the marker, `headers` itself is returned. With it, a new dict is
    returned that lacks both the marker and any Accept-Encoding header
    (matched case-insensitively); `headers` is left unmodified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding' and name != marker}
1314
1315
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store `params` as `_params`; remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req` through _create_http_connection, via a SOCKS class if requested."""
        conn_class = compat_http_client.HTTPConnection

        # 'Ytdl-socks-proxy' is an internal header: consume it here so that
        # it is not sent to the server
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded `data`; empty data is returned unchanged."""
        if not data:
            return data
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a regular zlib-wrapped stream
            return zlib.decompress(data)

    def http_request(self, req):
        """Escape the request URL and add the standard headers.

        Returns the request to send, which is a new object when the URL had
        to be escaped.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Add every standard header that the request does not set itself
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Honour and strip the internal "Youtubedl-no-compression" header
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decode gzip/deflate bodies and percent-encode redirect locations.

        Returns the response to pass on, which is a new addinfourl object
        when the body had to be decompressed.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; give up with the
                # original error if none of the attempts succeeds
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                # Only replace the header when escaping actually changed it
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets exactly the same request/response processing
    https_request = http_request
    https_response = http_response
1439
1440
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of `base_class` that connects through a SOCKS proxy.

    base_class  -- HTTPConnection or HTTPSConnection (or a subclass)
    socks_proxy -- proxy URL; the scheme selects the protocol (socks5,
                   socks/socks4 or socks4a), the port defaults to 1080 and
                   username/password are unquoted when present

    Raises ValueError if the scheme of `socks_proxy` is not one of the
    supported SOCKS schemes.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only numeric timeouts are applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, add TLS on top of the established proxy connection
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1482
1483
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._params = params
        # Default to the plain HTTPSConnection unless a class was supplied
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open `req`, going through a SOCKS connection class if the request asks for it."""
        conn_class = self._https_conn_class

        # Forward whichever of these attributes the base handler provides
        # ('_context': python > 2.6, '_check_hostname': python 3.x)
        open_kwargs = {
            option: getattr(self, attribute)
            for option, attribute in (
                ('context', '_context'),
                ('check_hostname', '_check_hostname'))
            if hasattr(self, attribute)}

        # 'Ytdl-socks-proxy' is an internal header and must not be sent out
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1507
1508
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    Cookie jar stored in the Netscape cookie file format, read and written as UTF-8.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
            filename = self.filename

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as out:
            out.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                expires = '' if cookie.expires is None else compat_str(cookie.expires)
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name, value = '', cookie.name
                else:
                    name, value = cookie.name, cookie.value
                columns = [cookie.domain, initial_dot, cookie.path,
                           secure, expires, name, value]
                out.write('\t'.join(columns) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file, skipping malformed entries with a warning."""
        if filename is None:
            if self.filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
            filename = self.filename

        def prepare_line(line):
            # Return the line to feed to the parser, or raise LoadError
            prefix = self._HTTPONLY_PREFIX
            if line.startswith(prefix):
                line = line[len(prefix):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            fields = line.split('\t')
            if len(fields) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(fields))
            entry = self._CookieFileEntry(*fields)
            if entry.expires_at and not entry.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
            return line

        # Collect the sanitized lines in memory and parse them in one go
        buffer = io.StringIO()
        with io.open(filename, encoding='utf-8') as cookie_file:
            for raw_line in cookie_file:
                try:
                    cleaned = prepare_line(raw_line)
                except compat_cookiejar.LoadError as err:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (err, raw_line), sys.stderr)
                else:
                    buffer.write(cleaned)
        buffer.seek(0)
        self._really_load(buffer, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1625
1626
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose HTTPS hooks are wired to the HTTP ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # Python 2 would choke on the next HTTP request in a row if there
        # were non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/ytdl-org/youtube-dl/issues/6769).
        # The workaround for that (percent-encoding the Set-Cookie and
        # Set-Cookie2 headers before processing them) is disabled, so the
        # response is simply handed to the base implementation.
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1649
1650
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Derived from the HTTPRedirectHandler implementation in CPython [1].

    Compared to the stock handler it:
     - makes sure the redirect URL is unicode under python 2
     - follows the HTTP response status code 308 Permanent Redirect [2],
       which some sites rely on [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # 301/303/307/308 are all processed by the stock 302 handler
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect response.

        Invoked by the http_error_30x methods. Returns a new Request for
        `newurl` when the (status code, method) combination may be
        redirected; otherwise raises HTTPError.
        """
        method = req.get_method()
        get_or_head_redirect = (
            code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD"))
        post_redirect = code in (301, 302, 303) and method == "POST"
        if not (get_or_head_redirect or post_redirect):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # RFC 2616 says a 301/302 answer to a POST must not be followed
        # without user confirmation; practically every client follows it
        # anyway, and so do we.

        # Under python 2 the redirect URL may arrive as a byte string;
        # coerce it so callers always get unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Tolerate URIs containing a space. http_error_302() already encodes
        # more thoroughly; this is kept for other callers.
        newurl = newurl.replace(' ', '%20')

        # The redirected request carries no body, so drop body-describing headers
        body_headers = ("content-length", "content-type")
        forwarded_headers = {
            name: value for name, value in req.headers.items()
            if name.lower() not in body_headers}
        return compat_urllib_request.Request(
            newurl, headers=forwarded_headers, origin_req_host=req.origin_req_host,
            unverifiable=True)
1706
1707
def extract_timezone(date_str):
    """Split a trailing UTC designator/offset off a date string.

    Returns a `(timedelta, remaining_str)` tuple. When no timezone suffix is
    recognised the offset is zero and the string is returned unchanged; a
    trailing `Z` is stripped and also yields a zero offset.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Bare "Z": UTC, no offset to apply
        return datetime.timedelta(), remainder

    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, remainder
1732
1733
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-like date string.

    Fractional seconds are discarded. When `timezone` (a timedelta) is not
    given it is extracted from the string itself. Returns None for a None
    input or when the string does not match `YYYY-MM-DD<delimiter>HH:MM:SS`.
    """
    if date_str is None:
        return None

    stripped = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, stripped = extract_timezone(stripped)

    with contextlib.suppress(ValueError):
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_time = datetime.datetime.strptime(stripped, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
    return None
1751
1752
def date_formats(day_first=True):
    """Return the strptime format list matching the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1755
1756
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when `date_str` is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (+ trailing timezone name) and any
    # numeric timezone suffix are dropped before trying the known formats
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; a later successful parse overrides an earlier one
    for fmt in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')

    if upload_date is None:
        # Fall back to RFC 2822 style parsing
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    if upload_date is None:
        return None
    return compat_str(upload_date)
1783
1784
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when `date_str` is None or no known format matches.
    """
    if date_str is None:
        return None

    text = re.sub(r'[,|]', '', date_str)

    # A "PM" marker anywhere in the string shifts the result by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_suffix = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if tz_suffix:
        text = text[:-len(tz_suffix.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    with_nanos = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', text)
    if with_nanos:
        text = with_nanos.group(1)

    for fmt in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(text, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822 style parsing
    timetuple = email.utils.parsedate_tz(text)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
    return None
1816
1817
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL.

    Looks at the text after the last '.' in the part of the URL before the
    first '?'. Returns `default_ext` when the URL is None, has no dot, or the
    candidate is neither purely alphanumeric nor a known extension followed
    by slashes.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    without_slashes = candidate.rstrip('/')
    if without_slashes in KNOWN_EXTENSIONS:
        return without_slashes
    return default_ext
1829
1830
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by `<sub_lang>.<sub_format>`."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1833
1834
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse a plain date string
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is None:
        # Plain date string without a relative offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # "<start><sign><amount><unit>": resolve the start, then apply the offset
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to whole days
        months = amount * 12 if unit == 'year' else amount
        new_date = datetime_add_months(start_time, months)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1875
1876
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    If "strict", only YYYYMMDD or
    (now|today|yesterday)([+-][0-9](day|week|month|year)(s)?)? is allowed.
    The relative offset is optional, so the bare keywords are accepted too.

    format: strptime format used to parse a plain date string

    Raises ValueError in strict mode when date_str is not in an allowed form.
    """
    # The offset group is optional: the bare keywords are valid inputs for
    # datetime_from_str and must not be rejected by the strict check.
    if strict and not re.fullmatch(
            r'\d{8}|(now|today|yesterday)([+-]\d+(day|week|month|year)(s)?)?', date_str):
        raise ValueError(f'Invalid date format {date_str}')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1889
1890
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    year_offset, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_offset
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1898
1899
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    precision: microsecond|second|minute|hour|day. 'microsecond' returns dt
               unchanged; the others round to the nearest multiple of the
               unit (halves round up) and return a new naive datetime.
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1916
1917
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if match is None else '-'.join(match.groups())
1926
1927
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings accepted by date_from_str in strict mode.

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError when start is later than end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed (non-strict) first
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1957
1958
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode with the preferred encoding if a byte string comes back
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1967
1968
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1975
1976
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Writes `s` straight to the Windows console with WriteConsoleW when `out`
    is stdout or stderr and is attached to a real console. In every other
    case nothing is written and the caller has to write the string itself.

    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> GetStdHandle() argument
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, a handle that is not a character
        # device (ignoring the "remote" flag), or one GetConsoleMode rejects
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop before the next
        # non-BMP character; count == 0 means s starts with such a character
        count = min(next_nonbmp_pos(s), 1024)

        # A non-BMP character is passed with a length of 2 (its two UTF-16
        # code units)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
2049
2050
def write_string(s, out=None, encoding=None):
    """Write the text `s` to `out` (default: sys.stderr) and flush it.

    On Windows, when no explicit encoding is requested, the string is first
    offered to _windows_write_string. Otherwise it is encoded (ignoring
    unencodable characters) for binary streams and streams exposing a
    `buffer`, or written as-is to any other stream.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if (sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno')
            and _windows_write_string(s, out)):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
2071
2072
def bytes_to_intlist(bs):
    """Return the items of `bs` as a list of ints.

    Items that are already ints are copied as-is; otherwise each item is
    converted with ord(). An empty/falsy input yields [].
    """
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
2080
2081
def intlist_to_bytes(xs):
    """Pack a sequence of ints into bytes, one unsigned byte per item."""
    if not xs:
        return b''
    layout = '%dB' % len(xs)
    return compat_struct_pack(layout, *xs)
2086
2087
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using whichever
# mechanism the platform offers: LockFileEx/UnlockFileEx on Windows, fcntl
# elsewhere, and stubs that raise IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure expected by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, chosen to span the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):  # todo: block unused on win32
        # Lock from offset 0; the OVERLAPPED pointer is stored on the file
        # object so _unlock_file can pass the same one back
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Requires that _lock_file was called on this file object before
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            # Shared lock unless `exclusive`; an exclusive lock is
            # non-blocking when `block` is false. `block` has no effect on
            # shared locks.
            try:
                fcntl.flock(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)
            except BlockingIOError:
                # The lock is held elsewhere: report it instead of falling
                # back to lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)

        def _unlock_file(f):
            # Same flock() -> lockf() fallback as in _lock_file
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive, block):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
2175
2176
class locked_file(object):
    """File wrapper that holds a lock on the file while it is in use.

    The file is opened on construction. The lock is taken by `__enter__()` /
    `open()` and released, together with closing the file, by `__exit__()` /
    `close()`. Read modes take a shared lock, all other modes an exclusive one.
    """

    def __init__(self, filename, mode, block=True, encoding=None):
        """Open `filename` in `mode` (one of r, rb, a, ab, w, wb).

        block: passed to _lock_file; whether acquiring the lock may block
        encoding: passed to io.open
        """
        assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode
        self.block = block

    def __enter__(self):
        # Readers share the lock, writers need it exclusively
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
        except IOError:
            # Do not leak the already opened file if locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The exception details are not used; the file is closed even if
        # unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

    def flush(self):
        self.f.flush()

    def open(self):
        """Acquire the lock; equivalent to entering the context manager."""
        return self.__enter__()

    def close(self, *args):
        """Release the lock and close the file.

        Positional arguments are accepted for backward compatibility and
        ignored.
        """
        # Previously `self` was passed as the exception type and any extra
        # positional argument caused a TypeError (duplicate `value`).
        self.__exit__(None, None, None)
2216
2217
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' if none is reported."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
2221
2222
def shell_quote(args):
    """Return the arguments shell-quoted and joined by single spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename'
    return ' '.join(
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2232
2233
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled in `url` is merged into `data` (which is updated in
    place), and the result is appended to the URL as a JSON-encoded fragment.
    """
    url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return url + '#' + fragment
2242
2243
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into `(url, data)`.

    Returns `(smug_url, default)` when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
2251
2252
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like k, M, etc

    num: the number to format (anything float_or_none accepts)
    fmt: %-format string receiving (scaled number, suffix)
    factor: base of the scale; 1024 switches to the Ki, Mi, ... suffixes

    Returns None if num cannot be converted to a number or is negative.
    Values below 1 get no suffix and values beyond the largest suffix are
    expressed in terms of the largest suffix.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes: a negative exponent would index the
        # suffix list from the end, a too large one would run past it
        exponent = min(max(int(math.log(num, factor)), 0), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2264
2265
def format_bytes(bytes):
    """Format a byte count with binary (1024-based) suffixes, or 'N/A' if it cannot be formatted."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2268
2269
def lookup_unit_table(unit_table, s):
    """Parse a leading "<number><unit>" from `s` using `unit_table`.

    unit_table maps unit strings to multipliers. Both ',' and '.' are
    accepted as decimal separator. Returns the number scaled by the unit's
    multiplier as an int, or None when `s` does not start with a number
    followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    match = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if match is None:
        return None
    number_text = match.group('num').replace(',', '.')
    multiplier = unit_table[match.group('unit')]
    return int(float(number_text) * multiplier)
2279
2280
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into a byte count.

    Returns None for a None input or when no "<number><unit>" is found at the
    start of the string.
    """
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name), smallest first
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        # Insertion order matters: it is the alternation order of the regex
        # built by lookup_unit_table
        unit_table.update((
            (letter + 'iB', binary),
            (letter + 'B', decimal),
            (lower + 'B', binary),
            (letter + 'b', decimal),
            (lower + 'b', decimal),
            (decimal_name + 'bytes', decimal),
            (binary_name + 'bytes', binary),
        ))

    return lookup_unit_table(unit_table, s)
2350
2351
def parse_count(s):
    """Parse a count such as '1,234', '1.5k' or '2M views' into an int.

    Returns None for a None input or when no count can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric word, e.g. a label in front of the number
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Plain number followed by other text
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', text)
    if leading_number is None:
        return None
    return str_to_int(leading_number.group(1))
2379
2380
def parse_resolution(s):
    """Extract resolution information from a string.

    Recognises, in order of preference, "<width>x<height>" (also with X, × or
    a comma), "<height>p"/"<height>i" and "4k"/"8k". Returns a dict with
    'width' and/or 'height', or an empty dict if nothing matches or `s` is
    None.
    """
    if s is None:
        return {}

    # (pattern, builder) pairs, tried in order; the first match wins
    recognisers = (
        (r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])',
         lambda m: {
             'width': int(m.group('w')),
             'height': int(m.group('h')),
         }),
        (r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])',
         lambda m: {'height': int(m.group(1))}),
        (r'\b([48])[kK]\b',
         lambda m: {'height': int(m.group(1)) * 540}),
    )
    for pattern, build in recognisers:
        mobj = re.search(pattern, s)
        if mobj:
            return build(mobj)

    return {}
2401
2402
def parse_bitrate(s):
    """Return the integer in front of 'kbps' in `s`, or None if `s` is not a string or has none."""
    if not isinstance(s, compat_str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2409
2410
def month_by_name(name, lang='en'):
    """ Return the number of a month by its (locale-independent) name

    The name is looked up in MONTH_NAMES[lang], falling back to the 'en'
    list for an unknown lang. Returns None if the name is not in the list.
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = month_names.index(name)
    except ValueError:
        return None
    return position + 1
2420
2421
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the month name.
        Returns None for an unknown abbreviation """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2430
2431
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML

    The predefined entities (amp, lt, gt, apos, quot) and numeric character
    references are left untouched. Numeric references may have up to 6
    hexadecimal or 7 decimal digits, enough for the highest code point
    (U+10FFFF), so references to characters above U+FFFF are not
    double-escaped.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
2438
2439
def setproctitle(title):
    """Set the process name to `title` through libc's prctl().

    Best effort: silently does nothing on Jython, when libc.so.6 cannot be
    loaded, or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes makes it one byte longer than the
    # title, so that it always ends with the NUL terminator prctl() expects
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2464
2465
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned unchanged if it is None or lacks the prefix."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2468
2469
def remove_end(s, end):
    """Return `s` without the suffix `end`.

    `s` is returned unchanged if it is None, does not end with `end`, or
    `end` is empty.
    """
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: s[:-len(end)] would be s[:0] (i.e. empty)
    # for an empty suffix
    return s[:len(s) - len(end)]
2472
2473
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`.

    `s` is returned unchanged if it is None, shorter than two characters, or
    not enclosed in a matching pair.
    """
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'", ):
        return s[1:-1]
    return s
2481
2482
def get_domain(url):
    """Return the host part of `url` without scheme and leading 'www.', or None if none is found."""
    match = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if match is None:
        return None
    return match.group('domain')
2486
2487
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading/trailing slashes."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
2491
2492
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any '?', '#' or '&'."""
    match = re.match(r'https?://[^?#&]+/', url)
    return match.group()
2495
2496
def urljoin(base, path):
    """Join `path` onto `base`, tolerating bad input.

    Byte strings are decoded as UTF-8. Returns None if `path` is empty or not
    a string. A `path` that already is an absolute or protocol-relative URL
    is returned as-is. Otherwise `base` must be an http(s) or
    protocol-relative URL, else None is returned.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    base_is_usable = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not base_is_usable:
        return None
    return compat_urlparse.urljoin(base, path)
2510
2511
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports 'HEAD'"""

    def get_method(self):
        return 'HEAD'
2515
2516
class PUTRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports 'PUT'"""

    def get_method(self):
        return 'PUT'
2520
2521
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return `default` if that fails

    The result is int(v) * invscale // scale.
    get_attr: when given and v is not None, the attribute of that name is
              read from v first (a missing attribute counts as None).
    """
    value = v
    if value is not None and get_attr:
        value = getattr(value, get_attr, None)
    try:
        result = int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        result = default
    return result
2529
2530
def str_or_none(v, default=None):
    """Return v converted with compat_str, or `default` when v is None"""
    if v is None:
        return default
    return compat_str(v)
2533
2534
def str_to_int(int_str):
    """A more relaxed version of int_or_none

    Integers are returned as they are. Strings have every ',', '.' and '+'
    removed before conversion. Any other type yields None.
    """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if not isinstance(int_str, compat_str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2542
2543
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, or return `default` if v is None or unconvertible

    The result is float(v) * invscale / scale.
    """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2551
2552
def bool_or_none(v, default=None):
    """Return v if it is a bool, `default` otherwise"""
    if isinstance(v, bool):
        return v
    return default
2555
2556
def strip_or_none(v, default=None):
    """Return v with surrounding whitespace stripped, or `default` if v is not a string"""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2559
2560
def url_or_none(url):
    """Return the stripped url if it looks like a URL, else None

    Accepted are protocol-relative URLs ("//...") and URLs whose scheme is
    one of the http(s), rtmp*/rtsp*, mms or ftp(s) variants spelled out in
    the pattern below.
    """
    if not (url and isinstance(url, compat_str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2566
2567
def request_to_url(req):
    """Return the full URL of a Request object; anything else is passed through unchanged"""
    if not isinstance(req, compat_urllib_request.Request):
        return req
    return req.get_full_url()
2573
2574
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with `date_format`, or return `default` on failure

    timestamp: either a number (taken as a unix timestamp, rendered in UTC)
               or a string (taken as a YYYYMMDD date).
    Any other type, or a ValueError/TypeError/AttributeError while parsing
    or formatting, yields `default`.
    """
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            moment = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            moment = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            return default
        return moment.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2585
2586
def parse_duration(s):
    """Parse a textual duration into a number of seconds

    Three notations are tried in turn:
        1. colon separated: [[[days:]hours:]mins:]secs[.ms]
        2. unit suffixed, optionally in "P...T..." form: e.g. "3d 2h 5m 10.5s".
           Year, month and week figures are matched but not captured, so
           they do not contribute to the result
        3. a single (possibly decimal) amount of hours or minutes
    A trailing "Z" is tolerated by every notation.

    Returns None if s is not a string, is blank, or matches no notation.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # Notation 1. When something precedes the seconds, the seconds field is
    # limited to two digits; a bare number may have any length
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Notation 2
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            # The capturing groups appear in exactly this order in the pattern
            days, hours, mins, secs, ms = m.groups()
        else:
            # Notation 3
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    # Components that were not captured (None or empty) contribute nothing
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its separator; ":123" is read as ".123"
        duration += float(ms.replace(':', '.'))
    return duration
2649
2650
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the filename's existing extension

    If expected_real_ext is given and differs from the actual extension,
    `ext` is appended to the whole filename instead.
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, suffix)
2657
2658
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the filename's extension for `ext`

    If expected_real_ext is given and differs from the actual extension,
    `ext` is appended to the whole filename instead.
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
2664
2665
def check_executable(exe, args=[]):
    """Try to run `exe` and return its name, or False if it cannot be started

    args: extra command line arguments, ideally ones that make the program
          produce a short output (like -version).
    """
    command = [exe] + args
    try:
        process = Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate_or_kill()
    except OSError:
        return False
    return exe
2674
2675
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr as text

    Returns False if the process could not be started (OSError).
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        process = Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate_or_kill()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        return output.decode('ascii', 'ignore')
    return output
2689
2690
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output

    version_re: pattern whose first group is the version; defaults to the
                token following the word "version".
    Returns `unrecognized` when the pattern is not found in output.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
2700
2701
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Return the version reported by `exe`, or False if it produced no output

    The executable is run with `args`; its output is handed to
    detect_exe_version together with version_re and unrecognized. False is
    also returned when the executable could not be run at all.
    """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2708
2709
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are read from the iterable only when needed and are appended to an
    internal cache as they are read.
    @param iterable  Source of the items; wrapped with iter()
    @param reverse   If true, iteration and indexing run from the last item
                     to the first
    @param _cache    List of already read items to start from. Passed by
                     __reversed__ and __copy__ so that the new object shares
                     the cache of the original
    '''

    class IndexError(IndexError):
        # Subclass of the builtin IndexError; raised by __getitem__
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what has been read so far, then continue reading from the
        # iterable, caching each new item before handing it out
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        # Read all remaining items into the cache and return the cache
        # (always in source order, regardless of the reverse flag)
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        # The returned list is a copy of the cache, reversed if needed
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Map an index counted from one end to the same position counted
        # from the other end (0 -> -1, 1 -> -2, -1 -> 0, ...)
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        # Normalise idx to (start, stop, step) in terms of the cache, which
        # is always kept in source order
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            # step == 0 keeps both "open ended" tests below false for ints
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of further items to read so that the cache reaches the
        # highest index involved
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Truthy if at least one item exists. Both branches translate to
        # index 0 of the cache, so at most one item has to be read
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known once everything has been read
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # New object with the opposite direction, sharing iterable and cache
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # New object with the same direction, sharing iterable and cache
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2798
2799
class PagedList:
    """Base class for lists whose entries are fetched page by page

    pagefunc:  callable taking a page number and returning an iterable with
               the entries of that page
    pagesize:  number of entries per page
    use_cache: whether fetched pages are remembered

    Subclasses must implement _getslice(start, end).
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page `pagenum` as a list"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            page = cached
        elif pagenum > self._pagecount:
            # Pages past the known page count are empty without asking pagefunc
            page = []
        else:
            page = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page
        return page

    def getslice(self, start=0, end=None):
        """Return the entries from index `start` up to (excluding) `end` as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2838
2839
class OnDemandPagedList(PagedList):
    """PagedList that requests pages one after another, stopping as soon as
    the requested slice is covered or a page comes back short"""

    def _getslice(self, start, end):
        # Begin with the page that contains index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Index of the first entry of this page and of the following page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset within this page at which the slice begins; non-zero
            # only on the page containing `start`
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset within this page at which the slice ends; None unless
            # `end` falls on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the preceding page as the last one (getpage returns
                # [] for pages above _pagecount), then let the error propagate
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2877
2878
class InAdvancePagedList(PagedList):
    """PagedList for sources whose number of pages is known beforehand

    Pages are always cached.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            stop_page, remaining = self._pagecount, None
        else:
            stop_page = min(self._pagecount, end // self._pagesize + 1)
            # Number of entries that still have to be yielded
            remaining = end - start
        # Entries to drop from the front of the first page
        offset = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = self.getpage(pagenum)
            if offset:
                page = page[offset:]
                offset = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the slice
                    yield from page[:remaining]
                    return
                remaining -= len(page)
            yield from page
2901
2902
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2909
2910
def lowercase_escape(s):
    """Replace every literal \\uXXXX escape (4 hex digits) in s by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2917
2918
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    The characters listed in `safe_chars` below are left as they are.
    """
    if sys.version_info < (3, 0):
        if isinstance(s, compat_str):
            s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2924
2925
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host is IDNA-encoded; path, params, query and fragment are each
    passed through escape_rfc3986.
    """
    parts = compat_urllib_parse_urlparse(url)
    replacements = {'netloc': parts.netloc.encode('idna').decode('ascii')}
    for field in ('path', 'params', 'query', 'fragment'):
        replacements[field] = escape_rfc3986(getattr(parts, field))
    return parts._replace(**replacements).geturl()
2936
2937
def parse_qs(url):
    """Return the query string of url, parsed with compat_parse_qs"""
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2940
2941
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object, one per line, and close it

    Blank lines and lines starting with '#', ';' or ']' are skipped.
    A '#' preceded by whitespace starts a trailing comment.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Drop a byte order mark, whether it was decoded properly or not
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        without_comment = re.split(r'\s#', url, 1)[0]
        return without_comment.rstrip()

    with contextlib.closing(batch_fd) as fd:
        urls = []
        for line in fd:
            cleaned = fixup(line)
            if cleaned:
                urls.append(cleaned)
        return urls
2959
2960
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes

    All arguments are forwarded to compat_urllib_parse_urlencode.
    """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2963
2964
def update_url_query(url, query):
    """Return url with the parameters in `query` merged into its query string

    Parameters already present in url are overwritten by those in `query`.
    A falsy `query` returns url untouched.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2973
2974
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request based on `req` with some parts replaced

    url:     replacement URL (default: the URL of req)
    data:    replacement body (default: the body of req)
    headers: headers added on top of those of req
    query:   parameters merged into the URL's query string

    HEAD and PUT requests keep their method; every other request becomes a
    plain Request. origin_req_host, unverifiable and, if set, timeout are
    carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target = update_url_query(url or req.get_full_url(), query)
    request_cls = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    clone = request_cls(
        target, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
2993
2994
def _multipart_encode_impl(data, boundary):
    """Encode `data` as multipart/form-data using the given boundary

    Returns a (body bytes, Content-Type header value) tuple.
    Raises ValueError if the boundary occurs within any of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    pieces = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        pieces.extend((delimiter, b'\r\n', content))

    pieces.append(delimiter + b'--\r\n')

    return b''.join(pieces), content_type
3015
3016
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body bytes, Content-Type header value) tuple.
    Raises ValueError if a specified boundary occurs within the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A boundary chosen by the caller gets a single attempt
        return _multipart_encode_impl(data, boundary)

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
3045
3046
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable of several keys, in d

    key_or_keys: a single key, or a list/tuple of keys tried in order.
    With several keys, values that are None are always skipped; values that
    are merely falsy are skipped unless skip_false_values is False.
    `default` is returned when nothing is found.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
3055
3056
def try_get(src, getter, expected_type=None):
    """Apply getter(s) to src and return the first acceptable result

    getter: a callable or several of them (passed through variadic()),
            tried in order.
    A getter is skipped if it raises AttributeError, KeyError, TypeError or
    IndexError, or if expected_type is given and the result is not an
    instance of it. None is returned if no getter succeeds.
    """
    for candidate in variadic(getter):
        try:
            value = candidate(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
3066
3067
def merge_dicts(*dicts):
    """Merge dicts into a new dict, with earlier dicts taking precedence

    None values are ignored. A key that is already present is overwritten
    only if its current value is an empty string and the new value is a
    non-empty string.
    """
    result = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key in result:
                current = result[key]
                fills_blank = (
                    isinstance(value, compat_str) and value
                    and isinstance(current, compat_str) and not current)
                if not fills_blank:
                    continue
            result[key] = value
    return result
3080
3081
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as compat_str, decoding it with `encoding` if it is not one already

    Note that the default encoding is determined once, when this function
    is defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
3084
3085
# Age limit for each rating label. Keys are upper-case; parse_age_limit
# upper-cases its input before looking it up here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit for each TV rating label. parse_age_limit accepts the labels with
# '-', '_' or nothing after "TV" and normalizes them to the 'TV-...' keys used here
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3103
3104
def parse_age_limit(s):
    """Convert an age rating into an age limit in years, or None if unknown

    Accepted are ints from 0 to 21, strings such as "18" or "18+", the
    labels of US_RATINGS and the labels of TV_PARENTAL_GUIDELINES (both
    case-insensitive).
    """
    # Exact type check: a bool is not an int here, it falls through to the
    # string test below and yields None
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]
    # The keys all start with "TV-"; only the part after it goes into the pattern
    tv_labels = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv = re.match(r'^TV[_-]?(%s)$' % tv_labels, rating)
    if tv is None:
        return None
    return TV_PARENTAL_GUIDELINES['TV-' + tv.group(1)]
3120
3121
def strip_jsonp(code):
    """Return the argument of a JSONP callback invocation

    Handled forms include "cb(...)", "window.cb(...)" and "cb && cb(...)",
    optionally followed by ';' and by '//' comments. Text that does not
    match is returned unchanged.
    """
    jsonp_call = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_call.sub(r'\g<callback_data>', code)
3130
3131
def js_to_json(code, vars={}):
    """Rewrite JavaScript literal syntax in `code` into JSON syntax

    The conversion works token by token with regular expressions:
        - string literals become double-quoted
        - comments and commas directly before a closing ] or } are removed
        - "undefined" and "void 0" become null
        - hexadecimal and octal integers become decimal
        - any other identifier is replaced by its entry in `vars`, or
          quoted if it has none
        - runs of "!" are removed
        - new Date("...") is reduced to its string argument

    vars is a dict of var, val pairs to substitute; the values are inserted
    into the output as they are.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Optional whitespace and/or one comment, as may appear before a ':'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) for integer notations that JSON does not have. The
    # optional trailing ':' is present when the integer is an object key
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Return the JSON replacement for one token matched by the main pattern
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # String literal: drop the enclosing quotes and translate the
            # escapes that JSON spells differently. The result is wrapped
            # in double quotes at the end of this function
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Object keys have to be strings in JSON
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        return '"%s"' % v

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    # Alternatives, in order: double-quoted string, single-quoted string,
    # comment or trailing comma, "void 0" or identifier, hex/octal integer
    # (optionally as key), decimal integer key, run of "!"
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3180
3181
def qualities(quality_ids):
    """Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 if the id is not listed.
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
3190
3191
# Names of the stages at which post-processing can take place
# NOTE(review): presumably the accepted values of a postprocessor's "when"
# setting - confirm against the callers
POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}


# Built-in output templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types besides 'default'
# NOTE(review): the value looks like the fixed filename suffix of that file
# type (None if there is none) - confirm against the callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# This is a template for str.format with two fields: {0} is the pattern for
# the mapping key and {1} the pattern for the conversion type. An even number
# of "%" directly in front of the specifier is captured as "prefix"
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3231
3232
def limit_length(s, length):
    """Add ellipses to overly long strings

    Returns s unchanged if it is None or at most `length` characters long.
    Otherwise s is cut so that, including the trailing "...", the result is
    exactly `length` characters long. If `length` is too small to hold the
    ellipsis plus at least one character of s, only (a part of) the
    ellipsis is returned, so the result never exceeds `length`.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # length - len(ELLIPSES) would be zero or negative here; a negative
        # slice end counts from the end of s and would keep too much of it
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
3241
3242
def version_tuple(v):
    """Split a version string at every '.' and '-' and return its parts as a tuple of ints

    Raises ValueError if a part is not an integer.
    """
    return tuple(map(int, re.split(r'[-.]', v)))
3245
3246
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`

    If version is empty or either version cannot be parsed, the answer is
    "not outdated" when assume_new is true and "outdated" otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
3254
3255
def ytdl_is_updateable():
    """Returns if yt-dlp can be updated with -U"""
    # Imported at call time rather than at module level
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3262
3263
def args_to_str(args):
    """Get a short string representation for a subprocess command

    Every argument is shell-quoted and the results are joined with spaces.
    """
    quoted = map(compat_shlex_quote, args)
    return ' '.join(quoted)
3267
3268
def error_to_compat_str(err):
    """Return the message of an exception as text"""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
3276
3277
def mimetype2ext(mt):
    """Guess a file extension from a MIME type

    mt: MIME type, optionally followed by parameters ("; charset=...").
        Matching is case-insensitive.
    Lookup order: the full type, then the subtype, then the structured
    syntax suffix (the part after "+"). If nothing matches, the subtype
    itself is returned in lower case with "+" replaced by ".".
    Returns None if mt is None.
    """
    if mt is None:
        return None

    # Parameters play no role here. Type and subtype are case-insensitive,
    # so normalise once instead of in only some of the lookups
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3340
3341
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension, filename or URL

    A value without any '.' is taken to be a bare extension.
    Returns None for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(target)
    return mime_type
3348
3349
def parse_codecs(codecs_str):
    """Split a comma separated codecs string into video, audio and text codec

    See http://tools.ietf.org/html/rfc6381
    Returns a dict with the keys 'vcodec', 'acodec' ('none' if absent),
    'dynamic_range' and, only if a text codec was found, 'tcodec'. The
    first codec of each kind wins. If no codec is recognized but exactly
    two were given, they are taken as video and audio codec, in this order.
    Otherwise an empty dict is returned. A warning is written to stderr
    for every unrecognized codec.
    """
    if not codecs_str:
        return {}
    split_codecs = [
        entry for entry in map(str.strip, codecs_str.strip().strip(',').split(','))
        if entry]
    vcodec = acodec = tcodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Codec name with all zeros removed, so that e.g. "av01" and "vp09"
        # are matched by "av1" and "vp9"
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            if codec in ('vp9', 'av1', 'hvc1'):
                # Only the first four fields are kept for these codecs
                vcodec = '.'.join(parts[:4])
            else:
                vcodec = full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            if not tcodec:
                tcodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec or tcodec:
        info = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if tcodec is not None:
            info['tcodec'] = tcodec
        return info
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3391
3392
def urlhandle_detect_ext(url_handle):
    """Guess the file extension from the headers of a response

    The filename of an attachment Content-Disposition header is preferred;
    the Content-Type header is used otherwise.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3405
3406
def encode_data_uri(data, mime_type):
    """Return a base64 ``data:`` URI holding the bytes `data` labelled with `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3409
3410
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when either the viewer's age limit or the content's
    limit is unset (None).
    """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3419
3420
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the encoding used for decoding (UTF-8
    otherwise). The result is the match object for optional whitespace
    followed by '<' at the start of the text, or None if there is no match.
    """

    # The 4-byte UTF-32 marks must be tested before the 2-byte UTF-16 ones,
    # since the UTF-32-LE mark starts with the UTF-16-LE mark
    known_boms = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    encoding, skip = 'utf-8', 0
    for bom, bom_encoding in known_boms:
        if first_bytes.startswith(bom):
            encoding, skip = bom_encoding, len(bom)
            break

    text = first_bytes[skip:].decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3439
3440
def determine_protocol(info_dict):
    """Return the download protocol for `info_dict`.

    An explicit 'protocol' entry wins. Otherwise the sanitized 'url' is
    inspected: a known streaming prefix (rtmp, mms, rtsp) first, then the
    manifest extensions m3u8 and f4m, and finally the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
3461
3462
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a header row and a list of data rows as an aligned text table.
    Within a cell, the text after a tab character is right aligned.

    @param header_row  List of column titles
    @param data        List of rows, each a list of cell values
    @param delim       If truthy, a string that is repeated to draw a
                       separator line below the header
    @param extra_gap   Number of additional spaces between columns
    @param hide_empty  If truthy, drop the columns whose data cells all have
                       zero width
    @returns           The rendered table as a single string
    """
    def visible_width(cell):
        # Terminal sequences and the alignment tab are not counted
        return len(remove_terminal_sequences(cell).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Positions that have no flag are kept (fillvalue=True)
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    def pad(cell, col_width):
        if '\t' in cell:
            # The tab expands to the free space, pushing the rest to the right
            return cell.replace('\t', ' ' * (col_width - visible_width(cell))) + ' ' * gap
        return cell + ' ' * (col_width - visible_width(cell) + gap)

    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    rows = [header_row] + data
    widths = column_widths(rows)
    gap = extra_gap + 1
    if delim:
        separator = [delim * (col_width + gap) for col_width in widths]
        separator[-1] = separator[-1][:-gap]  # Remove the gap from the end of the delimiter
        rows.insert(1, separator)

    lines = []
    for row in rows:
        cells = [pad(text, widths[pos]) for pos, text in enumerate(map(str, row))]
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3493
3494
3495 def _match_one(filter_part, dct, incomplete):
3496     # TODO: Generalize code with YoutubeDL._build_format_filter
3497     STRING_OPERATORS = {
3498         '*=': operator.contains,
3499         '^=': lambda attr, value: attr.startswith(value),
3500         '$=': lambda attr, value: attr.endswith(value),
3501         '~=': lambda attr, value: re.search(value, attr),
3502     }
3503     COMPARISON_OPERATORS = {
3504         **STRING_OPERATORS,
3505         '<=': operator.le,  # "<=" must be defined above "<"
3506         '<': operator.lt,
3507         '>=': operator.ge,
3508         '>': operator.gt,
3509         '=': operator.eq,
3510     }
3511
3512     operator_rex = re.compile(r'''(?x)\s*
3513         (?P<key>[a-z_]+)
3514         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3515         (?:
3516             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3517             (?P<strval>.+?)
3518         )
3519         \s*$
3520         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3521     m = operator_rex.search(filter_part)
3522     if m:
3523         m = m.groupdict()
3524         unnegated_op = COMPARISON_OPERATORS[m['op']]
3525         if m['negation']:
3526             op = lambda attr, value: not unnegated_op(attr, value)
3527         else:
3528             op = unnegated_op
3529         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3530         if m['quote']:
3531             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3532         actual_value = dct.get(m['key'])
3533         numeric_comparison = None
3534         if isinstance(actual_value, compat_numeric_types):
3535             # If the original field is a string and matching comparisonvalue is
3536             # a number we should respect the origin of the original field
3537             # and process comparison value as a string (see
3538             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3539             try:
3540                 numeric_comparison = int(comparison_value)
3541             except ValueError:
3542                 numeric_comparison = parse_filesize(comparison_value)
3543                 if numeric_comparison is None:
3544                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3545                 if numeric_comparison is None:
3546                     numeric_comparison = parse_duration(comparison_value)
3547         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3548             raise ValueError('Operator %s only supports string values!' % m['op'])
3549         if actual_value is None:
3550             return incomplete or m['none_inclusive']
3551         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3552
3553     UNARY_OPERATORS = {
3554         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3555         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3556     }
3557     operator_rex = re.compile(r'''(?x)\s*
3558         (?P<op>%s)\s*(?P<key>[a-z_]+)
3559         \s*$
3560         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3561     m = operator_rex.search(filter_part)
3562     if m:
3563         op = UNARY_OPERATORS[m.group('op')]
3564         actual_value = dct.get(m.group('key'))
3565         if incomplete and actual_value is None:
3566             return True
3567         return op(actual_value)
3568
3569     raise ValueError('Invalid filter part %r' % filter_part)
3570
3571
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False

    The filter string is split on every "&" that is not escaped with a
    backslash, and each part must pass for the whole filter to pass.
    When incomplete, every condition passes on a missing field
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        # Escaped ampersands belong to the condition itself
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3579
3580
def match_filter_func(filter_str):
    """Build a match filter callable from the filter string `filter_str`.

    The returned function takes an info dict (extra arguments are forwarded
    to match_str) and returns None if the dict passes the filter, or a
    message explaining that it is being skipped otherwise.
    """
    def _match_func(info_dict, *args, **kwargs):
        if not match_str(filter_str, info_dict, *args, **kwargs):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
3589
3590
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to a number of seconds.

    Accepts an offset such as "12", "12.5" or "12.5s", and a clock value such
    as "H:MM:SS", "H:MM:SS.fff" or "H:MM:SS:fff".
    Returns None for empty or unrecognized input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock is None:
        return None
    hours, minutes, seconds = clock.groups()
    # The fraction may be separated by a colon instead of a dot
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3602
3603
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode of the form 00:00:00,000."""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3606
3607
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode of the form 0:00:00.00.

    The last field is the milliseconds part divided by 10.
    """
    timetuple = timetuple_from_msec(seconds * 1000)
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3611
3612
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, translating the supported styling
    attributes into <font>, <b>, <i> and <u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError If the document contains no <p> elements
    '''
    # (current namespace, [legacy namespaces that are rewritten to it])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Names of the tts: attributes that are read from styles and elements
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, with inherited properties merged in
    default_style = {}  # properties taken from the styles of <body> and <div>

    class TTMLPElementParser(object):
        # Parser target that turns the markup of one <p> element into SRT text.
        # NOTE(review): the two lists below are class attributes that are
        # mutated in place, so every parser instance created during one
        # dfxp2srt() call shares them. An applied style that is pushed in
        # start() without opening a tag is not popped in end(), and therefore
        # stays visible to the following paragraphs. Changing this alters the
        # generated output, so it is deliberately left as is here.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: defaults, then the referenced style, then
                # the attributes set directly on the element
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Already in effect through the most recently applied style
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened for this element, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the element again and feed it through the target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may refer to a parent that is defined
    # later in the document, so further passes are made until no style is
    # left waiting for its parent.
    pending_before = None
    while True:
        pending = []  # (style id, parent id) of the styles still waiting for their parent
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    pending.append((style_id, parent_style_id))
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not pending:
            break
        if pending == pending_before:
            # This pass resolved nothing new, so repeating it unchanged would
            # loop forever. It happens when a parent is never registered: it
            # is undefined, has none of the supported properties, or is part
            # of a circular reference. Treat such parents as empty styles so
            # that the waiting styles at least keep their own properties.
            pending_ids = {style_id for style_id, _ in pending}
            parent_ids = {parent_id for _, parent_id in pending}
            # Prefer parents that are not waiting themselves; if there are
            # none, only circular references are left and all are seeded
            for parent_id in (parent_ids - pending_ids) or parent_ids:
                styles.setdefault(parent_id, {})
        pending_before = pending

    # The styles of <body> and <div> act as defaults for every element
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Paragraphs without usable timing are skipped, but still counted
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3775
3776
def cli_option(params, command_option, param):
    """Build the command line arguments for an option that takes a value.

    @param params          Dictionary of parameters
    @param command_option  The option to emit, e.g. '--proxy'
    @param param           Key of the value in `params`
    @returns               [command_option, value as a string], or an empty
                           list if the parameter is missing or None
    """
    value = params.get(param)
    if value is None:
        return []
    # Convert every value, including falsy ones such as 0 or False, so that
    # the argument list only ever contains strings
    return [command_option, str(value)]
3782
3783
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for a boolean option.

    Returns an empty list if the parameter is missing or None. Otherwise the
    option is followed by `true_value` or `false_value`, either as a separate
    argument or, if `separator` is given, joined into a single argument.
    The parameter must be a bool.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if not separator:
        return [command_option, value]
    return [command_option + separator + value]
3792
3793
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if the parameter equals `expected_value`, else an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3797
3798
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the extra command line arguments that apply to `keys`.

    @param argdict     Dictionary mapping lowercase keys to argument lists.
                       A plain list/tuple is returned unchanged if
                       `use_compat` is truthy and ignored otherwise
    @param keys        List of keys in order of priority. An entry may itself
                       hold several keys, whose argument lists are concatenated
    @param default     Returned as is when nothing matches
    @param use_compat  Whether a list/tuple `argdict` is accepted
    @returns           The arguments of the first entry of `keys` that has any
                       key present in `argdict`, or `default`
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        found = [args for args in candidates if args is not None]
        if found:
            return list(itertools.chain.from_iterable(found))
    return default
3817
3818
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the extra arguments configured for `exe` when used by `main_key`.

    The lookup keys are built from the root key, which is "exe" if both names
    match (case-insensitively) and "main_key+exe" otherwise, followed by each
    suffix in `keys`. If the bare root key is among them, the combination of
    both names and the 'default' key are used as fallbacks; otherwise a
    list/tuple `argdict` is not accepted.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]

    if root_key not in lookup_keys:
        return cli_configuration_args(argdict, lookup_keys, default, False)

    if main_key != exe:
        lookup_keys.append((main_key, exe))
    lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3830
3831
class ISO639Utils(object):
    """Conversion between two-letter and three-letter ISO 639 language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are looked up.
        Returns None for an unknown code.
        """
        two_letter_code = code[:2]
        return cls._lang_map.get(two_letter_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        If several two-letter codes map to `code`, the one listed first in the
        table is returned. Returns None for an unknown code.
        """
        return next(
            (short_name for short_name, long_name in cls._lang_map.items() if long_name == code),
            None)
4035
4036
class ISO3166Utils(object):
    """Lookup of country names by their two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive. Returns None for an unknown code.
        """
        country_code = code.upper()
        return cls._country_map.get(country_code)
4295
4296
4297 class GeoUtils(object):
4298     # Major IPv4 address blocks per country
4299     _country_ip_map = {
4300         'AD': '46.172.224.0/19',
4301         'AE': '94.200.0.0/13',
4302         'AF': '149.54.0.0/17',
4303         'AG': '209.59.64.0/18',
4304         'AI': '204.14.248.0/21',
4305         'AL': '46.99.0.0/16',
4306         'AM': '46.70.0.0/15',
4307         'AO': '105.168.0.0/13',
4308         'AP': '182.50.184.0/21',
4309         'AQ': '23.154.160.0/24',
4310         'AR': '181.0.0.0/12',
4311         'AS': '202.70.112.0/20',
4312         'AT': '77.116.0.0/14',
4313         'AU': '1.128.0.0/11',
4314         'AW': '181.41.0.0/18',
4315         'AX': '185.217.4.0/22',
4316         'AZ': '5.197.0.0/16',
4317         'BA': '31.176.128.0/17',
4318         'BB': '65.48.128.0/17',
4319         'BD': '114.130.0.0/16',
4320         'BE': '57.0.0.0/8',
4321         'BF': '102.178.0.0/15',
4322         'BG': '95.42.0.0/15',
4323         'BH': '37.131.0.0/17',
4324         'BI': '154.117.192.0/18',
4325         'BJ': '137.255.0.0/16',
4326         'BL': '185.212.72.0/23',
4327         'BM': '196.12.64.0/18',
4328         'BN': '156.31.0.0/16',
4329         'BO': '161.56.0.0/16',
4330         'BQ': '161.0.80.0/20',
4331         'BR': '191.128.0.0/12',
4332         'BS': '24.51.64.0/18',
4333         'BT': '119.2.96.0/19',
4334         'BW': '168.167.0.0/16',
4335         'BY': '178.120.0.0/13',
4336         'BZ': '179.42.192.0/18',
4337         'CA': '99.224.0.0/11',
4338         'CD': '41.243.0.0/16',
4339         'CF': '197.242.176.0/21',
4340         'CG': '160.113.0.0/16',
4341         'CH': '85.0.0.0/13',
4342         'CI': '102.136.0.0/14',
4343         'CK': '202.65.32.0/19',
4344         'CL': '152.172.0.0/14',
4345         'CM': '102.244.0.0/14',
4346         'CN': '36.128.0.0/10',
4347         'CO': '181.240.0.0/12',
4348         'CR': '201.192.0.0/12',
4349         'CU': '152.206.0.0/15',
4350         'CV': '165.90.96.0/19',
4351         'CW': '190.88.128.0/17',
4352         'CY': '31.153.0.0/16',
4353         'CZ': '88.100.0.0/14',
4354         'DE': '53.0.0.0/8',
4355         'DJ': '197.241.0.0/17',
4356         'DK': '87.48.0.0/12',
4357         'DM': '192.243.48.0/20',
4358         'DO': '152.166.0.0/15',
4359         'DZ': '41.96.0.0/12',
4360         'EC': '186.68.0.0/15',
4361         'EE': '90.190.0.0/15',
4362         'EG': '156.160.0.0/11',
4363         'ER': '196.200.96.0/20',
4364         'ES': '88.0.0.0/11',
4365         'ET': '196.188.0.0/14',
4366         'EU': '2.16.0.0/13',
4367         'FI': '91.152.0.0/13',
4368         'FJ': '144.120.0.0/16',
4369         'FK': '80.73.208.0/21',
4370         'FM': '119.252.112.0/20',
4371         'FO': '88.85.32.0/19',
4372         'FR': '90.0.0.0/9',
4373         'GA': '41.158.0.0/15',
4374         'GB': '25.0.0.0/8',
4375         'GD': '74.122.88.0/21',
4376         'GE': '31.146.0.0/16',
4377         'GF': '161.22.64.0/18',
4378         'GG': '62.68.160.0/19',
4379         'GH': '154.160.0.0/12',
4380         'GI': '95.164.0.0/16',
4381         'GL': '88.83.0.0/19',
4382         'GM': '160.182.0.0/15',
4383         'GN': '197.149.192.0/18',
4384         'GP': '104.250.0.0/19',
4385         'GQ': '105.235.224.0/20',
4386         'GR': '94.64.0.0/13',
4387         'GT': '168.234.0.0/16',
4388         'GU': '168.123.0.0/16',
4389         'GW': '197.214.80.0/20',
4390         'GY': '181.41.64.0/18',
4391         'HK': '113.252.0.0/14',
4392         'HN': '181.210.0.0/16',
4393         'HR': '93.136.0.0/13',
4394         'HT': '148.102.128.0/17',
4395         'HU': '84.0.0.0/14',
4396         'ID': '39.192.0.0/10',
4397         'IE': '87.32.0.0/12',
4398         'IL': '79.176.0.0/13',
4399         'IM': '5.62.80.0/20',
4400         'IN': '117.192.0.0/10',
4401         'IO': '203.83.48.0/21',
4402         'IQ': '37.236.0.0/14',
4403         'IR': '2.176.0.0/12',
4404         'IS': '82.221.0.0/16',
4405         'IT': '79.0.0.0/10',
4406         'JE': '87.244.64.0/18',
4407         'JM': '72.27.0.0/17',
4408         'JO': '176.29.0.0/16',
4409         'JP': '133.0.0.0/8',
4410         'KE': '105.48.0.0/12',
4411         'KG': '158.181.128.0/17',
4412         'KH': '36.37.128.0/17',
4413         'KI': '103.25.140.0/22',
4414         'KM': '197.255.224.0/20',
4415         'KN': '198.167.192.0/19',
4416         'KP': '175.45.176.0/22',
4417         'KR': '175.192.0.0/10',
4418         'KW': '37.36.0.0/14',
4419         'KY': '64.96.0.0/15',
4420         'KZ': '2.72.0.0/13',
4421         'LA': '115.84.64.0/18',
4422         'LB': '178.135.0.0/16',
4423         'LC': '24.92.144.0/20',
4424         'LI': '82.117.0.0/19',
4425         'LK': '112.134.0.0/15',
4426         'LR': '102.183.0.0/16',
4427         'LS': '129.232.0.0/17',
4428         'LT': '78.56.0.0/13',
4429         'LU': '188.42.0.0/16',
4430         'LV': '46.109.0.0/16',
4431         'LY': '41.252.0.0/14',
4432         'MA': '105.128.0.0/11',
4433         'MC': '88.209.64.0/18',
4434         'MD': '37.246.0.0/16',
4435         'ME': '178.175.0.0/17',
4436         'MF': '74.112.232.0/21',
4437         'MG': '154.126.0.0/17',
4438         'MH': '117.103.88.0/21',
4439         'MK': '77.28.0.0/15',
4440         'ML': '154.118.128.0/18',
4441         'MM': '37.111.0.0/17',
4442         'MN': '49.0.128.0/17',
4443         'MO': '60.246.0.0/16',
4444         'MP': '202.88.64.0/20',
4445         'MQ': '109.203.224.0/19',
4446         'MR': '41.188.64.0/18',
4447         'MS': '208.90.112.0/22',
4448         'MT': '46.11.0.0/16',
4449         'MU': '105.16.0.0/12',
4450         'MV': '27.114.128.0/18',
4451         'MW': '102.70.0.0/15',
4452         'MX': '187.192.0.0/11',
4453         'MY': '175.136.0.0/13',
4454         'MZ': '197.218.0.0/15',
4455         'NA': '41.182.0.0/16',
4456         'NC': '101.101.0.0/18',
4457         'NE': '197.214.0.0/18',
4458         'NF': '203.17.240.0/22',
4459         'NG': '105.112.0.0/12',
4460         'NI': '186.76.0.0/15',
4461         'NL': '145.96.0.0/11',
4462         'NO': '84.208.0.0/13',
4463         'NP': '36.252.0.0/15',
4464         'NR': '203.98.224.0/19',
4465         'NU': '49.156.48.0/22',
4466         'NZ': '49.224.0.0/14',
4467         'OM': '5.36.0.0/15',
4468         'PA': '186.72.0.0/15',
4469         'PE': '186.160.0.0/14',
4470         'PF': '123.50.64.0/18',
4471         'PG': '124.240.192.0/19',
4472         'PH': '49.144.0.0/13',
4473         'PK': '39.32.0.0/11',
4474         'PL': '83.0.0.0/11',
4475         'PM': '70.36.0.0/20',
4476         'PR': '66.50.0.0/16',
4477         'PS': '188.161.0.0/16',
4478         'PT': '85.240.0.0/13',
4479         'PW': '202.124.224.0/20',
4480         'PY': '181.120.0.0/14',
4481         'QA': '37.210.0.0/15',
4482         'RE': '102.35.0.0/16',
4483         'RO': '79.112.0.0/13',
4484         'RS': '93.86.0.0/15',
4485         'RU': '5.136.0.0/13',
4486         'RW': '41.186.0.0/16',
4487         'SA': '188.48.0.0/13',
4488         'SB': '202.1.160.0/19',
4489         'SC': '154.192.0.0/11',
4490         'SD': '102.120.0.0/13',
4491         'SE': '78.64.0.0/12',
4492         'SG': '8.128.0.0/10',
4493         'SI': '188.196.0.0/14',
4494         'SK': '78.98.0.0/15',
4495         'SL': '102.143.0.0/17',
4496         'SM': '89.186.32.0/19',
4497         'SN': '41.82.0.0/15',
4498         'SO': '154.115.192.0/18',
4499         'SR': '186.179.128.0/17',
4500         'SS': '105.235.208.0/21',
4501         'ST': '197.159.160.0/19',
4502         'SV': '168.243.0.0/16',
4503         'SX': '190.102.0.0/20',
4504         'SY': '5.0.0.0/16',
4505         'SZ': '41.84.224.0/19',
4506         'TC': '65.255.48.0/20',
4507         'TD': '154.68.128.0/19',
4508         'TG': '196.168.0.0/14',
4509         'TH': '171.96.0.0/13',
4510         'TJ': '85.9.128.0/18',
4511         'TK': '27.96.24.0/21',
4512         'TL': '180.189.160.0/20',
4513         'TM': '95.85.96.0/19',
4514         'TN': '197.0.0.0/11',
4515         'TO': '175.176.144.0/21',
4516         'TR': '78.160.0.0/11',
4517         'TT': '186.44.0.0/15',
4518         'TV': '202.2.96.0/19',
4519         'TW': '120.96.0.0/11',
4520         'TZ': '156.156.0.0/14',
4521         'UA': '37.52.0.0/14',
4522         'UG': '102.80.0.0/13',
4523         'US': '6.0.0.0/8',
4524         'UY': '167.56.0.0/13',
4525         'UZ': '84.54.64.0/18',
4526         'VA': '212.77.0.0/19',
4527         'VC': '207.191.240.0/21',
4528         'VE': '186.88.0.0/13',
4529         'VG': '66.81.192.0/20',
4530         'VI': '146.226.0.0/16',
4531         'VN': '14.160.0.0/11',
4532         'VU': '202.80.32.0/20',
4533         'WF': '117.20.32.0/21',
4534         'WS': '202.4.32.0/19',
4535         'YE': '134.35.0.0/16',
4536         'YT': '41.242.116.0/22',
4537         'ZA': '41.0.0.0/11',
4538         'ZM': '102.144.0.0/13',
4539         'ZW': '102.177.192.0/18',
4540     }
4541
4542     @classmethod
4543     def random_ipv4(cls, code_or_block):
4544         if len(code_or_block) == 2:
4545             block = cls._country_ip_map.get(code_or_block.upper())
4546             if not block:
4547                 return None
4548         else:
4549             block = code_or_block
4550         addr, preflen = block.split('/')
4551         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4552         addr_max = addr_min | (0xffffffff >> int(preflen))
4553         return compat_str(socket.inet_ntoa(
4554             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4555
4556
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that allows each request to select its own proxy

    A request may carry a 'Ytdl-request-proxy' header; when present, its
    value replaces the proxy configured for the handler ('__noproxy__'
    disables proxying for that request). The header is removed before the
    request is passed on.
    """

    def __init__(self, proxies=None):
        # Set default handlers before the base class is initialised with
        # the configured proxies, so that http and https requests reach
        # proxy_open even when no proxy is configured for them
        for scheme in ('http', 'https'):
            # The defaults bind the current scheme and the bound method
            def _default_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)

            setattr(self, '%s_open' % scheme, _default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy takes precedence over the configured one
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(self, req, proxy, type)
4580
4581
4582 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4583 # released into Public Domain
4584 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4585
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    The result is big-endian and carries no leading zero bytes. Zero and
    negative values are both returned as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian representation: exactly as many bytes as the
        # value needs, so there is nothing to strip afterwards
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # Left-pad up to the next multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4614
4615
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    byte string yields 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    return int.from_bytes(s, 'big')
4631
4632
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA variant.
    See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The input is read in reverse byte order before being turned
    # into the integer that gets exponentiated
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4648
4649
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the
    padding consists of at least 8 random non-zero octets.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for the minimum
                               of 11 bytes of overhead
    """
    data = list(data)
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding octets must be non-zero: the first zero octet after the
    # [0, 2] prefix marks the start of the data, so a zero inside the padding
    # would make the message undecodable.
    # NOTE: the random module is not a cryptographically secure generator
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4663
4664
def encode_base_n(num, n, table=None):
    """Convert the integer num to its base-n string representation

    @param num      The integer to encode. Negative numbers are encoded as
                    '-' followed by the encoding of their absolute value
    @param n        The base
    @param table    The digits to use, indexed by digit value. Defaults to
                    the first n characters of 0-9, a-z, A-Z
    @raises ValueError  if n exceeds the table length, or if n is smaller
                        than 2 and num is non-zero
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # With a base below 2 the repeated division never reaches zero
    if n < 2:
        raise ValueError('base %d is too small to encode a non-zero number' % n)

    # Floor division of a negative number never reaches zero either,
    # so encode the magnitude and put the sign in front
    sign = ''
    if num < 0:
        sign, num = '-', -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return sign + ''.join(reversed(digits))
4681
4682
def decode_packed_codes(code):
    """Expand the packed code found in `code` by PACKED_CODES_RE

    The match provides the obfuscated code, a base, a symbol count and a
    '|'-separated symbol list. Every word of the obfuscated code is replaced
    by the symbol whose index is that word read as a base-n number; an empty
    symbol means the word stands for itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed_source, radix, remaining, words = match.groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def _expand(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, packed_source)
4699
4700
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher to s

    Every character of s that occurs in alphabet is replaced by the one
    `shift` positions further on, wrapping around at the end of the
    alphabet. Other characters are left as they are.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
4708
4709
def rot47(s):
    """Rotate the printable ASCII characters of s ('!' to '~') by 47 places"""
    printable = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable, 47)
4712
4713
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list into a dict

    The list consists of comma-separated KEY=value pairs. Values may be
    enclosed in double quotes, in which case they may contain commas; the
    quotes are not part of the returned value.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in pairs
    }
4721
4722
def urshift(val, n):
    """Shift val right by n bits, treating negative values as unsigned 32-bit"""
    if val < 0:
        # Reinterpret the negative number as its 32-bit two's complement
        val += 0x100000000
    return val >> n
4725
4726
4727 # Based on png2str() written by @gdkchan and improved by @yokrysty
4728 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG data into its dimensions and raw colour values

    @param png_data     The complete contents of a PNG file, as bytes
    @returns            A tuple (width, height, pixels). pixels is a list
                        with one entry per image row; each row is a flat
                        list of width * 3 integers in the range 0-255
    @raises IOError     if the signature or the leading IHDR chunk is
                        missing, or if there is no IDAT data

    Only width and height are read from IHDR. The decoding always assumes
    3 bytes per pixel; bit depth, colour type and interlacing are not
    checked. Chunk CRCs are skipped without being verified.
    """
    # Reference: https://www.w3.org/TR/PNG/
    # Everything after the 8-byte signature, i.e. the sequence of chunks
    header = png_data[8:]

    # The first chunk (4 bytes length, then 4 bytes type) has to be IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integers of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the data into chunks: length, type, data and CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # The compressed image data may be spread over several IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Number of colour values per row (3 per pixel)
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # idx counts already decoded values over all rows, stride per row
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Every row of the decompressed data starts with its filter type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before being filled so that _get_pixel can already
        # read the values decoded earlier in this row
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Corresponding value of the pixel to the left (3 values back)
            if x > 2:
                left = _get_pixel(basex - 3)
            # Corresponding value of the pixel above
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the row's filter; any other filter_type (e.g. 0)
            # leaves the value unchanged
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # Corresponding value of the pixel above and to the left
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use whichever of a, b, c is closest to p, preferring
                # a over b over c on ties
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
4832
4833
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`

    `value` has to be bytes. The first usable backend is picked:
    the `xattr` Python module (either pyxattr or xattr); if that cannot be
    imported, NTFS Alternate Data Streams when compat_os_name is 'nt',
    otherwise the `setfattr` or `xattr` executable.

    Raises XAttrUnavailableError when there is no usable backend and
    XAttrMetadataError when writing the attribute fails.
    """
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            minimum_pyxattr = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(minimum_pyxattr):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        minimum_pyxattr, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The command line tools take the value as text
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', value]
        else:
            executable, opts = 'xattr', ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd.extend(encodeArgument(o) for o in opts)
        cmd.append(encodeFilename(path, True))

        try:
            p = Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        _, stderr = p.communicate_or_kill()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
4916
4917
def random_birthday(year_field, month_field, day_field):
    """Generate a random date of birth between 1950-01-01 and 1995-12-31

    The three arguments are the keys under which the year, month and day
    are stored in the returned dict. The values are strings without
    zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    day_span = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, day_span))
    field_names = (year_field, month_field, day_field)
    field_values = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, field_values)))
4928
4929
# Templates for internet shortcut files, which are plain text files.
# The placeholders use the %(name)s mapping syntax: every template takes
# `url`, and the .desktop template additionally takes `filename`.
# Each literal starts with a newline for readability; .lstrip() removes it
# so that the template begins with its first real line.

# INI-style [InternetShortcut] file
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# Apple property list (XML) with a single URL entry
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# [Desktop Entry] file of type Link
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Lookup of the templates above by link type
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4961
4962
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Unicode hostnames are converted to Punycode. An explicit port is kept, except for port 80 of `http` IRIs, where it is the default. IRIs without a hostname (e.g. relative references) only have their remaining components escaped.

    Raises ValueError for IRIs with an IPv6 host, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    if iri_parts.hostname is not None:
        net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # Port 80 is only redundant for http. Dropping it for any other scheme
    # would silently redirect the URI to that scheme's own default port.
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5005
5006
def to_high_limit_path(path):
    """Return path in a form that is not subject to MAX_PATH on Windows

    On win32 and cygwin the path is made absolute and prefixed with `\\\\?\\`;
    on every other platform it is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return r'\\?\ '.rstrip() + os.path.abspath(path)
5013
5014
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value looked up in obj using a %-style template

    @param obj          The object passed to traverse_obj
    @param field        The path (or paths) handed to traverse_obj
    @param template     The %-format string applied to the value
    @param ignore       Values for which `default` is returned instead
    @param default      The result for ignored values
    @param func         Optional function applied to the value before
                        it is formatted
    """
    value = traverse_obj(obj, *variadic(field))
    if value in ignore:
        return default
    if func:
        value = func(value)
    return template % value
5020
5021
def clean_podcast_url(url):
    """Remove the prefixes of known podcast tracking services from url

    Every occurrence is removed, so chained prefixes are handled as well.
    """
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
5037
5038
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random version 4 UUID in its canonical string form

    In the template, every `x` becomes a random hex digit. `y` carries the
    UUID variant in its two most significant bits, which have to be `10`,
    so it becomes one of 8, 9, a or b. The `4` is the version number.
    """
    def _random_digit(mobj):
        if mobj.group(0) == 'y':
            # 0b10xx: variant bits fixed, the remaining two bits random
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5044
5045
def make_dir(path, to_screen=None):
    """Create the directory that is to contain the file `path`

    @param path         Path of a file; only its directory part is created,
                        including any missing parent directories
    @param to_screen    Optional callable that is given the error message
                        when the directory cannot be created
    @returns            True if the directory exists afterwards (or path has
                        no directory part), False if creating it failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Only report the error when a reporting function was supplied
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5056
5057
def get_executable_path():
    """Return the absolute path of the directory the program is run from

    This is the directory of the executable when `sys.frozen` is set,
    two levels above this file when the module was loaded by a zipimporter,
    and one level above this file otherwise.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    else:
        # Running from ZIP needs one more level up than running from source
        from_zip = isinstance(globals().get('__loader__'), zipimporter)
        base = os.path.join(os.path.dirname(__file__), '../..' if from_zip else '..')
    return os.path.abspath(base)
5067
5068
def load_plugins(name, suffix, namespace):
    """Load plugin classes from `ytdlp_plugins/<name>/__init__.py`

    The file is looked up below get_executable_path() and executed as a
    module called `name`. Every attribute of that module whose name ends
    with `suffix` and is not yet a key of `namespace` is added to
    `namespace`.

    @param name         Name of the plugin package, also used as module name
    @param suffix       Required ending of the names that are picked up
    @param namespace    Dict that receives the loaded objects
    @returns            Dict of the objects that were added, keyed by name.
                        It is empty when the plugin file does not exist
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        previous_module = sys.modules.get(plugins_spec.name)
        sys.modules[plugins_spec.name] = plugins
        try:
            plugins_spec.loader.exec_module(plugins)
        except BaseException:
            # Don't leave a half-initialised module behind in sys.modules
            if previous_module is None:
                sys.modules.pop(plugins_spec.name, None)
            else:
                sys.modules[plugins_spec.name] = previous_module
            raise
        for attr_name in dir(plugins):
            if attr_name in namespace:
                continue
            if not attr_name.endswith(suffix):
                continue
            klass = getattr(plugins, attr_name)
            classes[attr_name] = namespace[attr_name] = klass
    except FileNotFoundError:
        pass
    return classes
5087
5088
5089 def traverse_obj(
5090         obj, *path_list, default=None, expected_type=None, get_all=True,
5091         casesense=True, is_user_input=False, traverse_string=False):
5092     ''' Traverse nested list/dict/tuple
5093     @param path_list        A list of paths which are checked one by one.
5094                             Each path is a list of keys where each key is a string,
5095                             a function, a tuple of strings/None or "...".
5096                             When a fuction is given, it takes the key as argument and
5097                             returns whether the key matches or not. When a tuple is given,
5098                             all the keys given in the tuple are traversed, and
5099                             "..." traverses all the keys in the object
5100                             "None" returns the object without traversal
5101     @param default          Default value to return
5102     @param expected_type    Only accept final value of this type (Can also be any callable)
5103     @param get_all          Return all the values obtained from a path or only the first one
5104     @param casesense        Whether to consider dictionary keys as case sensitive
5105     @param is_user_input    Whether the keys are generated from user input. If True,
5106                             strings are converted to int/slice if necessary
5107     @param traverse_string  Whether to traverse inside strings. If True, any
5108                             non-compatible object will also be converted into a string
5109     # TODO: Write tests
5110     '''
5111     if not casesense:
5112         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5113         path_list = (map(_lower, variadic(path)) for path in path_list)
5114
5115     def _traverse_obj(obj, path, _current_depth=0):
5116         nonlocal depth
5117         path = tuple(variadic(path))
5118         for i, key in enumerate(path):
5119             if None in (key, obj):
5120                 return obj
5121             if isinstance(key, (list, tuple)):
5122                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5123                 key = ...
5124             if key is ...:
5125                 obj = (obj.values() if isinstance(obj, dict)
5126                        else obj if isinstance(obj, (list, tuple, LazyList))
5127                        else str(obj) if traverse_string else [])
5128                 _current_depth += 1
5129                 depth = max(depth, _current_depth)
5130                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5131             elif callable(key):
5132                 if isinstance(obj, (list, tuple, LazyList)):
5133                     obj = enumerate(obj)
5134                 elif isinstance(obj, dict):
5135                     obj = obj.items()
5136                 else:
5137                     if not traverse_string:
5138                         return None
5139                     obj = str(obj)
5140                 _current_depth += 1
5141                 depth = max(depth, _current_depth)
5142                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5143             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5144                 obj = (obj.get(key) if casesense or (key in obj)
5145                        else next((v for k, v in obj.items() if _lower(k) == key), None))
5146             else:
5147                 if is_user_input:
5148                     key = (int_or_none(key) if ':' not in key
5149                            else slice(*map(int_or_none, key.split(':'))))
5150                     if key == slice(None):
5151                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5152                 if not isinstance(key, (int, slice)):
5153                     return None
5154                 if not isinstance(obj, (list, tuple, LazyList)):
5155                     if not traverse_string:
5156                         return None
5157                     obj = str(obj)
5158                 try:
5159                     obj = obj[key]
5160                 except IndexError:
5161                     return None
5162         return obj
5163
5164     if isinstance(expected_type, type):
5165         type_test = lambda val: val if isinstance(val, expected_type) else None
5166     elif expected_type is not None:
5167         type_test = expected_type
5168     else:
5169         type_test = lambda val: val
5170
5171     for path in path_list:
5172         depth = 0
5173         val = _traverse_obj(obj, path)
5174         if val is not None:
5175             if depth:
5176                 for _ in range(depth - 1):
5177                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5178                 val = [v for v in map(type_test, val) if v is not None]
5179                 if val:
5180                     return val if get_all else val[0]
5181             else:
5182                 val = type_test(val)
5183                 if val is not None:
5184                     return val
5185     return default
5186
5187
def traverse_dict(dictn, keys, casesense=True):
    """
    Deprecated wrapper around traverse_obj

    Writes a deprecation notice through write_string and then forwards to
    traverse_obj with is_user_input=True and traverse_string=True.

    @param dictn        Object to traverse
    @param keys         Path passed on to traverse_obj
    @param casesense    Passed on to traverse_obj unchanged
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5192
5193
def variadic(x, allowed_types=(str, bytes, dict)):
    """
    Return `x` unchanged if it is an iterable, otherwise wrap it in a 1-tuple

    Instances of `allowed_types` are treated as single values and are wrapped
    even though they are iterable.
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, allowed_types):
        return (x,)
    return x
5196
5197
def decode_base(value, digits):
    """
    Convert a base-x string into an integer

    @param value    Sequence of symbols, most significant first
    @param digits   Symbols of the numeral system ordered by their value;
                    its length is the base
    Raises KeyError if `value` contains a symbol that is not in `digits`.
    """
    radix = len(digits)
    symbol_values = {symbol: position for position, symbol in enumerate(digits)}
    total = 0
    for symbol in value:
        total = total * radix + symbol_values[symbol]
    return total
5207
5208
def time_seconds(**kwargs):
    """
    Return the current time in seconds since the epoch, shifted by a UTC offset

    @param kwargs   Keyword arguments for datetime.timedelta (e.g. hours=9)
                    giving the offset to add. No arguments means no shift.
    @returns        time.time() plus the offset, as a float
    Raises ValueError if the offset is not a valid datetime.timezone offset.
    """
    offset = datetime.timedelta(**kwargs)
    # Keep rejecting the same out-of-range offsets the previous implementation did
    datetime.timezone(offset)
    # The timestamp of an aware datetime does not depend on its tzinfo,
    # so the offset has to be added explicitly to have any effect
    return time.time() + offset.total_seconds()
5212
5213
5214 # create a JSON Web Signature (jws) with HS256 algorithm
5215 # the resulting format is in JWS Compact Serialization
5216 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5217 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """
    Build a three-segment, dot-separated token signed with HMAC-SHA256

    @param payload_data JSON-serializable payload
    @param key          Signing secret as str (encoded to UTF-8)
    @param headers      Extra header fields merged over {'alg': 'HS256', 'typ': 'JWT'}
    @returns            bytes of the form b'<header>.<payload>.<signature>'
    """
    # NOTE(review): segments use the standard base64 alphabet with padding,
    # not unpadded base64url - confirm that consumers accept this
    def _b64_json(data):
        return base64.b64encode(json.dumps(data).encode('utf-8'))

    header_data = dict(alg='HS256', typ='JWT')
    if headers:
        header_data.update(headers)

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
5231
5232
5233 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Return the decoded JSON payload of a three-segment JWT

    The signature is NOT verified and the header is ignored.

    @param jwt  Token as str in the form '<header>.<payload>.<signature>'
    @returns    The payload segment parsed as JSON
    Raises ValueError if the token does not have exactly three segments.
    """
    _, payload_b64, _ = jwt.split('.')
    # JWS segments are base64url without trailing '=' padding, which
    # urlsafe_b64decode rejects; restore the padding before decoding
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5238
5239
def supports_terminal_sequences(stream):
    """
    Tell whether terminal escape sequences can be written to `stream`

    Returns False on Windows ('nt') when WINDOWS_VT_MODE is falsy or the
    Windows version is below 10.0.10586, and on other systems when the TERM
    environment variable is unset or empty. Otherwise returns stream.isatty(),
    or False if that call raises.
    """
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE:
            return False
        if get_windows_version() < (10, 0, 10586):
            return False
    elif not os.environ.get('TERM'):
        return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
5251
5252
# ESC, '[', one or more characters other than 'm', then 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
5258
5259
def number_of_digits(number):
    """Return the length of `number` formatted with '%d' (a minus sign is counted)"""
    rendered = '%d' % number
    return len(rendered)
5262
5263
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the truthy items of `values` with `delim`, converting each to str

    @param values       Items to join; falsy items are dropped
    @param delim        Separator placed between the kept items
    @param from_dict    If given, `values` are keys that are first looked up
                        with from_dict.get
    """
    if from_dict is not None:
        lookup = from_dict.get
        values = [lookup(name) for name in values]
    return delim.join(str(item) for item in values if item)
5268
5269
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns    (start, end, total); end and total are passed through
                int_or_none. (None, None, None) is returned when the value is
                empty or contains no "bytes" range.
    """
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
5278
5279
class Config:
    """
    A set of command-line arguments together with nested configs

    Each instance holds its own argument list (`own_args`), the file it was
    read from (`filename`, if any) and the child configs loaded from the
    config locations that its own arguments name.
    """
    own_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser = parser
        self.label = label
        # Real paths of files already loaded; append_config shares this set with children
        self._loaded_paths = set()
        self.configs = []

    def init(self, args=None, filename=None):
        """
        Store `args` and load every config location they refer to

        Relative config locations are joined onto the directory of `filename`.
        @returns    False if `filename` was already loaded, True otherwise
        """
        assert not self.__initialized
        base_dir = ''
        if filename:
            real_path = os.path.realpath(filename)
            base_dir = os.path.dirname(real_path)
            if real_path in self._loaded_paths:
                return False
            self._loaded_paths.add(real_path)

        self.__initialized = True
        self.own_args = args
        self.filename = filename

        parsed_opts = self._parser.parse_args(args)[0]
        for entry in parsed_opts.config_locations or []:
            entry = os.path.join(base_dir, expand_path(entry))
            if os.path.isdir(entry):
                entry = os.path.join(entry, 'yt-dlp.conf')
            if not os.path.exists(entry):
                self._parser.error(f'config location {entry} does not exist')
            self.append_config(self.read_file(entry), entry)
        return True

    def __str__(self):
        quoted_name = f'"{self.filename}"' if self.filename else ''
        label = join_nonempty(self.label, 'config', quoted_name, delim=' ')

        # False is dropped by join_nonempty, so nothing is printed without own_args
        header = False
        if self.own_args is not None:
            header = f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}'

        # Prefix every line of a nested config with "| "
        nested = [f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs]
        return join_nonempty(header, *nested, delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Return the shell-split contents of `filename`, or `default` if it cannot be opened"""
        try:
            optionf = open(filename)
        except IOError:
            return default  # silently skip if file is not present
        with optionf:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            if sys.version_info < (3,):
                contents = contents.decode(preferredencoding())
            return compat_shlex_split(contents, comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of login-related options replaced by 'PRIVATE'"""
        private_opts = {
            '-p', '--password', '-u', '--username',
            '--video-password', '--ap-password', '--ap-username'}
        alternation = '|'.join(map(re.escape, private_opts))
        eq_pattern = re.compile('^(?P<key>' + alternation + ')=.+$')

        def _mask_inline(opt):
            # Handles the "--option=value" form
            found = eq_pattern.match(opt)
            return found.group('key') + '=PRIVATE' if found else opt

        scrubbed = [_mask_inline(opt) for opt in opts]
        # Handles the "--option value" form. Entries are re-read after being
        # overwritten, so an option that was itself masked does not mask its successor
        for idx in range(len(scrubbed) - 1):
            if scrubbed[idx] in private_opts:
                scrubbed[idx + 1] = 'PRIVATE'
        return scrubbed

    def append_config(self, *args, label=None):
        """Create a child config from `args` (as accepted by init) and keep it if it loads"""
        child = type(self)(self._parser, label)
        child._loaded_paths = self._loaded_paths
        if child.init(*args):
            self.configs.append(child)

    @property
    def all_args(self):
        """Arguments of the nested configs (last added first), followed by own_args"""
        for child in reversed(self.configs):
            for arg in child.all_args:
                yield arg
        for arg in self.own_args or []:
            yield arg

    def parse_args(self):
        return self._parser.parse_args([*self.all_args])
5367
5368
class WebSocketsWrapper():
    """
    Wraps websockets module to use in non-async scopes

    The connection runs on a private event loop; every call drives that loop
    until the corresponding coroutine has finished. Use as a context manager.
    """

    def __init__(self, url, headers=None):
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        # Make sure the connection is shut down even if the caller never leaves the context
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # __exit__ is also registered with atexit, so it can run a second time
        # after the loop has been closed; nothing is left to do in that case
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Tasks must be cancelled while the loop can still run;
            # closing first would make run_until_complete raise RuntimeError
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If any new library using asyncio needs to be run in non-async scopes,
    # move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """
        Run the coroutine `main` on `loop` until it completes and return its result

        Raises ValueError if `main` is not a coroutine.
        """
        if not asyncio.coroutines.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            # Not every supported Python version has shutdown_default_executor
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of `loop` and report exceptions they raised"""
        to_cancel = asyncio.tasks.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() no longer accepts "loop" on Python 3.10+. It is not needed:
        # with a non-empty argument list the loop is taken from the tasks
        loop.run_until_complete(
            asyncio.tasks.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5432
5433
# Truthiness of compat_websockets, exposed as a plain bool
# NOTE(review): presumably compat_websockets is falsy when the optional
# `websockets` package is not installed - confirm in .compat
has_websockets = bool(compat_websockets)