yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import asyncio
   7 import atexit
   8 import base64
   9 import binascii
  10 import calendar
  11 import codecs
  12 import collections
  13 import contextlib
  14 import ctypes
  15 import datetime
  16 import email.utils
  17 import email.header
  18 import errno
  19 import functools
  20 import gzip
  21 import hashlib
  22 import hmac
  23 import importlib.util
  24 import io
  25 import itertools
  26 import json
  27 import locale
  28 import math
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import socket
  35 import ssl
  36 import subprocess
  37 import sys
  38 import tempfile
  39 import time
  40 import traceback
  41 import xml.etree.ElementTree
  42 import zlib
  43 import mimetypes
  44
  45 from .compat import (
  46     compat_HTMLParseError,
  47     compat_HTMLParser,
  48     compat_HTTPError,
  49     compat_basestring,
  50     compat_chr,
  51     compat_cookiejar,
  52     compat_ctypes_WINFUNCTYPE,
  53     compat_etree_fromstring,
  54     compat_expanduser,
  55     compat_html_entities,
  56     compat_html_entities_html5,
  57     compat_http_client,
  58     compat_integer_types,
  59     compat_numeric_types,
  60     compat_kwargs,
  61     compat_os_name,
  62     compat_parse_qs,
  63     compat_shlex_split,
  64     compat_shlex_quote,
  65     compat_str,
  66     compat_struct_pack,
  67     compat_struct_unpack,
  68     compat_urllib_error,
  69     compat_urllib_parse,
  70     compat_urllib_parse_urlencode,
  71     compat_urllib_parse_urlparse,
  72     compat_urllib_parse_urlunparse,
  73     compat_urllib_parse_quote,
  74     compat_urllib_parse_quote_plus,
  75     compat_urllib_parse_unquote_plus,
  76     compat_urllib_request,
  77     compat_urlparse,
  78     compat_websockets,
  79     compat_xpath,
  80 )
  81
  82 from .socks import (
  83     ProxyType,
  84     sockssocket,
  85 )
  86
  87
  88 def register_socks_protocols():
  89     # "Register" SOCKS protocols
  90     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  91     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  92     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  93         if scheme not in compat_urlparse.uses_netloc:
  94             compat_urlparse.uses_netloc.append(scheme)
  95
  96
# Type of compiled regular-expression objects, obtained from an actual
# compiled pattern because it is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  99
 100
 101 def random_user_agent():
 102     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 103     _CHROME_VERSIONS = (
 104         '90.0.4430.212',
 105         '90.0.4430.24',
 106         '90.0.4430.70',
 107         '90.0.4430.72',
 108         '90.0.4430.85',
 109         '90.0.4430.93',
 110         '91.0.4472.101',
 111         '91.0.4472.106',
 112         '91.0.4472.114',
 113         '91.0.4472.124',
 114         '91.0.4472.164',
 115         '91.0.4472.19',
 116         '91.0.4472.77',
 117         '92.0.4515.107',
 118         '92.0.4515.115',
 119         '92.0.4515.131',
 120         '92.0.4515.159',
 121         '92.0.4515.43',
 122         '93.0.4556.0',
 123         '93.0.4577.15',
 124         '93.0.4577.63',
 125         '93.0.4577.82',
 126         '94.0.4606.41',
 127         '94.0.4606.54',
 128         '94.0.4606.61',
 129         '94.0.4606.71',
 130         '94.0.4606.81',
 131         '94.0.4606.85',
 132         '95.0.4638.17',
 133         '95.0.4638.50',
 134         '95.0.4638.54',
 135         '95.0.4638.69',
 136         '95.0.4638.74',
 137         '96.0.4664.18',
 138         '96.0.4664.45',
 139         '96.0.4664.55',
 140         '96.0.4664.93',
 141         '97.0.4692.20',
 142     )
 143     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 144
 145
# Default HTTP request headers. The User-Agent is picked by
# random_user_agent() once, when this module is imported.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 153
 154
# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 158
 159
# Sentinel for "no default value supplied". It is compared by identity
# (e.g. in the xpath_* helpers), which keeps None usable as a real default.
NO_DEFAULT = object()
 161
# Full English month names, in calendar order (January first)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, each list in calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 172
# File extensions (without the leading dot) that are recognised as known
# media, container or manifest formats
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 187
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# paired positionally with the characters of the first string; replacements
# longer than one character ('AE', 'OE', 'TH', 'ss', ...) are wrapped in lists
# so that itertools.chain yields them as single items.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 192
# strptime-style format strings that are not ambiguous about day/month order
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# DATE_FORMATS plus the numeric formats that read the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# DATE_FORMATS plus the numeric formats that read the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 256
# Matches the trailing argument list `}('...',<int>,<int>,'...'.split('|')`
# and captures its four arguments
# NOTE(review): presumably the tail of "packed" (p,a,c,k,e,d) JavaScript - confirm
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive,
# dot matches newlines); its body is captured in the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 259
 260
 261 def preferredencoding():
 262     """Get preferred encoding.
 263
 264     Returns the best encoding scheme for the system, based on
 265     locale.getpreferredencoding() and some further tweaks.
 266     """
 267     try:
 268         pref = locale.getpreferredencoding()
 269         'TEST'.encode(pref)
 270     except Exception:
 271         pref = 'UTF-8'
 272
 273     return pref
 274
 275
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created in the same
    directory as fn (named "<basename of fn>.<random>.tmp"), which is then
    renamed onto fn. If anything fails, the temporary file is removed
    (best effort) and the exception is re-raised.

    @param obj  The object to serialize with json.dump
    @param fn   The destination file name
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # (note: these lambdas ignore their argument and read fn from the
        # enclosing scope)
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # delete=False: the temporary file must survive being closed so that it
    # can be renamed onto fn below
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        # The with-block closes the temporary file before it is renamed
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            # os.umask() can only be read by setting it, so set it to 0 and
            # restore it right away; then give the file 0o666 minus the umask
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; removal is best effort
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
 334
 335
 336 if sys.version_info >= (2, 7):
 337     def find_xpath_attr(node, xpath, key, val=None):
 338         """ Find the xpath xpath[@key=val] """
 339         assert re.match(r'^[a-zA-Z_-]+$', key)
 340         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 341         return node.find(expr)
 342 else:
 343     def find_xpath_attr(node, xpath, key, val=None):
 344         for f in node.findall(compat_xpath(xpath)):
 345             if key not in f.attrib:
 346                 continue
 347             if val is None or f.attrib.get(key) == val:
 348                 return f
 349         return None
 350
 351 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 352 # the namespace parameter
 353
 354
 355 def xpath_with_ns(path, ns_map):
 356     components = [c.split(':') for c in path.split('/')]
 357     replaced = []
 358     for c in components:
 359         if len(c) == 1:
 360             replaced.append(c[0])
 361         else:
 362             ns, tag = c
 363             replaced.append('{%s}%s' % (ns_map[ns], tag))
 364     return '/'.join(replaced)
 365
 366
 367 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 368     def _find_xpath(xpath):
 369         return node.find(compat_xpath(xpath))
 370
 371     if isinstance(xpath, (str, compat_str)):
 372         n = _find_xpath(xpath)
 373     else:
 374         for xp in xpath:
 375             n = _find_xpath(xp)
 376             if n is not None:
 377                 break
 378
 379     if n is None:
 380         if default is not NO_DEFAULT:
 381             return default
 382         elif fatal:
 383             name = xpath if name is None else name
 384             raise ExtractorError('Could not find XML element %s' % name)
 385         else:
 386             return None
 387     return n
 388
 389
 390 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 391     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 392     if n is None or n == default:
 393         return n
 394     if n.text is None:
 395         if default is not NO_DEFAULT:
 396             return default
 397         elif fatal:
 398             name = xpath if name is None else name
 399             raise ExtractorError('Could not find XML element\'s text %s' % name)
 400         else:
 401             return None
 402     return n.text
 403
 404
 405 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 406     n = find_xpath_attr(node, xpath, key)
 407     if n is None:
 408         if default is not NO_DEFAULT:
 409             return default
 410         elif fatal:
 411             name = '%s[@%s]' % (xpath, key) if name is None else name
 412             raise ExtractorError('Could not find XML attribute %s' % name)
 413         else:
 414             return None
 415     return n.attrib[key]
 416
 417
 418 def get_element_by_id(id, html):
 419     """Return the content of the tag with the specified ID in the passed HTML document"""
 420     return get_element_by_attribute('id', id, html)
 421
 422
 423 def get_element_html_by_id(id, html):
 424     """Return the html of the tag with the specified ID in the passed HTML document"""
 425     return get_element_html_by_attribute('id', id, html)
 426
 427
 428 def get_element_by_class(class_name, html):
 429     """Return the content of the first tag with the specified class in the passed HTML document"""
 430     retval = get_elements_by_class(class_name, html)
 431     return retval[0] if retval else None
 432
 433
 434 def get_element_html_by_class(class_name, html):
 435     """Return the html of the first tag with the specified class in the passed HTML document"""
 436     retval = get_elements_html_by_class(class_name, html)
 437     return retval[0] if retval else None
 438
 439
 440 def get_element_by_attribute(attribute, value, html, escape_value=True):
 441     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 442     return retval[0] if retval else None
 443
 444
 445 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
 446     retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
 447     return retval[0] if retval else None
 448
 449
 450 def get_elements_by_class(class_name, html):
 451     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 452     return get_elements_by_attribute(
 453         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 454         html, escape_value=False)
 455
 456
 457 def get_elements_html_by_class(class_name, html):
 458     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 459     return get_elements_html_by_attribute(
 460         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 461         html, escape_value=False)
 462
 463
 464 def get_elements_by_attribute(*args, **kwargs):
 465     """Return the content of the tag with the specified attribute in the passed HTML document"""
 466     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 467
 468
 469 def get_elements_html_by_attribute(*args, **kwargs):
 470     """Return the html of the tag with the specified attribute in the passed HTML document"""
 471     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 472
 473
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator: for every opening tag in html that carries the
    attribute with a matching value it yields a tuple (content, whole).

    @param attribute     The name of the attribute to look for
    @param value         The attribute value to match
    @param html          The HTML document to search
    @param escape_value  If true, value is matched literally (it is passed
                         through re.escape); otherwise it is inserted into
                         the pattern as a regular expression
    """

    # The quotes around the attribute value are optional in the pattern,
    # unless value starts with whitespace or one of the characters "'`=<>
    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag up to and including the wanted attribute.
    # The value is embedded with (?-x:...) so that the verbose flag does not
    # apply to it.
    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element, starting from the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        # If the whole content is wrapped in a pair of matching quotes, strip
        # them; then decode the HTML entities
        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 497
 498
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised by handle_endtag once the stack of open tags becomes empty"""
        pass

    def __init__(self):
        # Names of the tags that are currently open, outermost first
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        # The attributes of the tag are not needed
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop open tags until the one being closed is found; tags popped on
        # the way are treated as implicitly closed
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            # The stack was exhausted without finding the tag
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        # An empty stack means that the outermost tag has just been closed
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
 539
 540
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @param tag   The tag name of the element to extract
    @param html  The HTML document to search
    @returns     A tuple (content, whole)
    @raises compat_HTMLParseError  if the opening or the matching closing
                                   tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like haystack.index(needle), but raising exc if needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Turn the index relative to whole_start into an index into html that
    # points just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag first so that it ends up at the bottom of the
        # tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document up to each following closing tag in turn, until
        # the parser signals that the closing tag matching the opening tag
        # has been reached
        while offset < len(html):
            # Both positions below are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 574
 575
 576 class HTMLAttributeParser(compat_HTMLParser):
 577     """Trivial HTML parser to gather the attributes for a single element"""
 578
 579     def __init__(self):
 580         self.attrs = {}
 581         compat_HTMLParser.__init__(self)
 582
 583     def handle_starttag(self, tag, attrs):
 584         self.attrs = dict(attrs)
 585
 586
 587 class HTMLListAttrsParser(compat_HTMLParser):
 588     """HTML parser to gather the attributes for the elements of a list"""
 589
 590     def __init__(self):
 591         compat_HTMLParser.__init__(self)
 592         self.items = []
 593         self._level = 0
 594
 595     def handle_starttag(self, tag, attrs):
 596         if tag == 'li' and self._level == 0:
 597             self.items.append(dict(attrs))
 598         self._level += 1
 599
 600     def handle_endtag(self, tag):
 601         self._level -= 1
 602
 603
 604 def extract_attributes(html_element):
 605     """Given a string for an HTML element such as
 606     <el
 607          a="foo" B="bar" c="&98;az" d=boz
 608          empty= noval entity="&amp;"
 609          sq='"' dq="'"
 610     >
 611     Decode and return a dictionary of attributes.
 612     {
 613         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 614         'empty': '', 'noval': None, 'entity': '&',
 615         'sq': '"', 'dq': '\''
 616     }.
 617     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 618     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 619     """
 620     parser = HTMLAttributeParser()
 621     try:
 622         parser.feed(html_element)
 623         parser.close()
 624     # Older Python may throw HTMLParseError in case of malformed HTML
 625     except compat_HTMLParseError:
 626         pass
 627     return parser.attrs
 628
 629
 630 def parse_list(webpage):
 631     """Given a string for an series of HTML <li> elements,
 632     return a dictionary of their attributes"""
 633     parser = HTMLListAttrsParser()
 634     parser.feed(webpage)
 635     parser.close()
 636     return parser.items
 637
 638
 639 def clean_html(html):
 640     """Clean an HTML snippet into a readable string"""
 641
 642     if html is None:  # Convenience for sanitizing descriptions etc.
 643         return html
 644
 645     html = re.sub(r'\s+', ' ', html)
 646     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 647     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 648     # Strip html tags
 649     html = re.sub('<.*?>', '', html)
 650     # Replace html entities
 651     html = unescapeHTML(html)
 652     return html.strip()
 653
 654
 655 def sanitize_open(filename, open_mode):
 656     """Try to open the given filename, and slightly tweak it if this fails.
 657
 658     Attempts to open the given filename. If this fails, it tries to change
 659     the filename slightly, step by step, until it's either able to open it
 660     or it fails and raises a final exception, like the standard open()
 661     function.
 662
 663     It returns the tuple (stream, definitive_file_name).
 664     """
 665     try:
 666         if filename == '-':
 667             if sys.platform == 'win32':
 668                 import msvcrt
 669                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 670             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 671         stream = locked_file(filename, open_mode, block=False).open()
 672         return (stream, filename)
 673     except (IOError, OSError) as err:
 674         if err.errno in (errno.EACCES,):
 675             raise
 676
 677         # In case of error, try to remove win32 forbidden chars
 678         alt_filename = sanitize_path(filename)
 679         if alt_filename == filename:
 680             raise
 681         else:
 682             # An exception here should be caught in the caller
 683             stream = locked_file(filename, open_mode, block=False).open()
 684             return (stream, alt_filename)
 685
 686
 687 def timeconvert(timestr):
 688     """Convert RFC 2822 defined time string into system timestamp"""
 689     timestamp = None
 690     timetuple = email.utils.parsedate_tz(timestr)
 691     if timetuple is not None:
 692         timestamp = email.utils.mktime_tz(timetuple)
 693     return timestamp
 694
 695
 696 def sanitize_filename(s, restricted=False, is_id=False):
 697     """Sanitizes a string so it could be used as part of a filename.
 698     If restricted is set, use a stricter subset of allowed characters.
 699     Set is_id if this is not an arbitrary string, but an ID that should be kept
 700     if possible.
 701     """
 702     def replace_insane(char):
 703         if restricted and char in ACCENT_CHARS:
 704             return ACCENT_CHARS[char]
 705         elif not restricted and char == '\n':
 706             return ' '
 707         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 708             return ''
 709         elif char == '"':
 710             return '' if restricted else '\''
 711         elif char == ':':
 712             return '_-' if restricted else ' -'
 713         elif char in '\\/|*<>':
 714             return '_'
 715         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 716             return '_'
 717         if restricted and ord(char) > 127:
 718             return '_'
 719         return char
 720
 721     if s == '':
 722         return ''
 723     # Handle timestamps
 724     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 725     result = ''.join(map(replace_insane, s))
 726     if not is_id:
 727         while '__' in result:
 728             result = result.replace('__', '_')
 729         result = result.strip('_')
 730         # Common case of "Foreign band name - English song title"
 731         if restricted and result.startswith('-_'):
 732             result = result[2:]
 733         if result.startswith('-'):
 734             result = '_' + result[len('-'):]
 735         result = result.lstrip('.')
 736         if not result:
 737             result = '_'
 738     return result
 739
 740
 741 def sanitize_path(s, force=False):
 742     """Sanitizes and normalizes path on Windows"""
 743     if sys.platform == 'win32':
 744         force = False
 745         drive_or_unc, _ = os.path.splitdrive(s)
 746         if sys.version_info < (2, 7) and not drive_or_unc:
 747             drive_or_unc, _ = os.path.splitunc(s)
 748     elif force:
 749         drive_or_unc = ''
 750     else:
 751         return s
 752
 753     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 754     if drive_or_unc:
 755         norm_path.pop(0)
 756     sanitized_path = [
 757         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 758         for path_part in norm_path]
 759     if drive_or_unc:
 760         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 761     elif force and s[0] == os.path.sep:
 762         sanitized_path.insert(0, os.path.sep)
 763     return os.path.join(*sanitized_path)
 764
 765
 766 def sanitize_url(url):
 767     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 768     # the number of unwanted failures due to missing protocol
 769     if url.startswith('//'):
 770         return 'http:%s' % url
 771     # Fix some common typos seen so far
 772     COMMON_TYPOS = (
 773         # https://github.com/ytdl-org/youtube-dl/issues/15649
 774         (r'^httpss://', r'https://'),
 775         # https://bx1.be/lives/direct-tv/
 776         (r'^rmtp([es]?)://', r'rtmp\1://'),
 777     )
 778     for mistake, fixup in COMMON_TYPOS:
 779         if re.match(mistake, url):
 780             return re.sub(mistake, fixup, url)
 781     return url
 782
 783
 784 def extract_basic_auth(url):
 785     parts = compat_urlparse.urlsplit(url)
 786     if parts.username is None:
 787         return url, None
 788     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 789         parts.hostname if parts.port is None
 790         else '%s:%d' % (parts.hostname, parts.port))))
 791     auth_payload = base64.b64encode(
 792         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 793     return url, 'Basic ' + auth_payload.decode('utf-8')
 794
 795
 796 def sanitized_Request(url, *args, **kwargs):
 797     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 798     if auth_header is not None:
 799         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 800         headers['Authorization'] = auth_header
 801     return compat_urllib_request.Request(url, *args, **kwargs)
 802
 803
 804 def expand_path(s):
 805     """Expand shell variables and ~"""
 806     return os.path.expandvars(compat_expanduser(s))
 807
 808
 809 def orderedSet(iterable):
 810     """ Remove all duplicates from the input iterable """
 811     res = []
 812     for el in iterable:
 813         if el not in res:
 814             res.append(el)
 815     return res
 816
 817
 818 def _htmlentity_transform(entity_with_semicolon):
 819     """Transforms an HTML entity to a character."""
 820     entity = entity_with_semicolon[:-1]
 821
 822     # Known non-numeric HTML entity
 823     if entity in compat_html_entities.name2codepoint:
 824         return compat_chr(compat_html_entities.name2codepoint[entity])
 825
 826     # TODO: HTML5 allows entities without a semicolon. For example,
 827     # '&Eacuteric' should be decoded as 'Éric'.
 828     if entity_with_semicolon in compat_html_entities_html5:
 829         return compat_html_entities_html5[entity_with_semicolon]
 830
 831     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 832     if mobj is not None:
 833         numstr = mobj.group(1)
 834         if numstr.startswith('x'):
 835             base = 16
 836             numstr = '0%s' % numstr
 837         else:
 838             base = 10
 839         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 840         try:
 841             return compat_chr(int(numstr, base))
 842         except ValueError:
 843             pass
 844
 845     # Unknown entity in name, return its literal representation
 846     return '&%s;' % entity
 847
 848
 849 def unescapeHTML(s):
 850     if s is None:
 851         return None
 852     assert type(s) == compat_str
 853
 854     return re.sub(
 855         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 856
 857
 858 def escapeHTML(text):
 859     return (
 860         text
 861         .replace('&', '&amp;')
 862         .replace('<', '&lt;')
 863         .replace('>', '&gt;')
 864         .replace('"', '&quot;')
 865         .replace("'", '&#39;')
 866     )
 867
 868
 869 def process_communicate_or_kill(p, *args, **kwargs):
 870     try:
 871         return p.communicate(*args, **kwargs)
 872     except BaseException:  # Including KeyboardInterrupt
 873         p.kill()
 874         p.wait()
 875         raise
 876
 877
 878 class Popen(subprocess.Popen):
 879     if sys.platform == 'win32':
 880         _startupinfo = subprocess.STARTUPINFO()
 881         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 882     else:
 883         _startupinfo = None
 884
 885     def __init__(self, *args, **kwargs):
 886         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 887
 888     def communicate_or_kill(self, *args, **kwargs):
 889         return process_communicate_or_kill(self, *args, **kwargs)
 890
 891
 892 def get_subprocess_encoding():
 893     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 894         # For subprocess calls, encode with locale encoding
 895         # Refer to http://stackoverflow.com/a/9951851/35070
 896         encoding = preferredencoding()
 897     else:
 898         encoding = sys.getfilesystemencoding()
 899     if encoding is None:
 900         encoding = 'utf-8'
 901     return encoding
 902
 903
 904 def encodeFilename(s, for_subprocess=False):
 905     """
 906     @param s The name of the file
 907     """
 908
 909     assert type(s) == compat_str
 910
 911     # Python 3 has a Unicode API
 912     if sys.version_info >= (3, 0):
 913         return s
 914
 915     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 916     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 917     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 918     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 919         return s
 920
 921     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 922     if sys.platform.startswith('java'):
 923         return s
 924
 925     return s.encode(get_subprocess_encoding(), 'ignore')
 926
 927
 928 def decodeFilename(b, for_subprocess=False):
 929
 930     if sys.version_info >= (3, 0):
 931         return b
 932
 933     if not isinstance(b, bytes):
 934         return b
 935
 936     return b.decode(get_subprocess_encoding(), 'ignore')
 937
 938
 939 def encodeArgument(s):
 940     if not isinstance(s, compat_str):
 941         # Legacy code that uses byte strings
 942         # Uncomment the following line after fixing all post processors
 943         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 944         s = s.decode('ascii')
 945     return encodeFilename(s, True)
 946
 947
 948 def decodeArgument(b):
 949     return decodeFilename(b, True)
 950
 951
 952 def decodeOption(optval):
 953     if optval is None:
 954         return optval
 955     if isinstance(optval, bytes):
 956         optval = optval.decode(preferredencoding())
 957
 958     assert isinstance(optval, compat_str)
 959     return optval
 960
 961
 962 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 963
 964
 965 def timetuple_from_msec(msec):
 966     secs, msec = divmod(msec, 1000)
 967     mins, secs = divmod(secs, 60)
 968     hrs, mins = divmod(mins, 60)
 969     return _timetuple(hrs, mins, secs, msec)
 970
 971
 972 def formatSeconds(secs, delim=':', msec=False):
 973     time = timetuple_from_msec(secs * 1000)
 974     if time.hours:
 975         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 976     elif time.minutes:
 977         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 978     else:
 979         ret = '%d' % time.seconds
 980     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 981
 982
 983 def _ssl_load_windows_store_certs(ssl_context, storename):
 984     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 985     try:
 986         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 987                  if encoding == 'x509_asn' and (
 988                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 989     except PermissionError:
 990         return
 991     for cert in certs:
 992         try:
 993             ssl_context.load_verify_locations(cadata=cert)
 994         except ssl.SSLError:
 995             pass
 996
 997
 998 def make_HTTPS_handler(params, **kwargs):
 999     opts_check_certificate = not params.get('nocheckcertificate')
1000     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1001     context.check_hostname = opts_check_certificate
1002     if params.get('legacyserverconnect'):
1003         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
1004     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1005     if opts_check_certificate:
1006         try:
1007             context.load_default_certs()
1008             # Work around the issue in load_default_certs when there are bad certificates. See:
1009             # https://github.com/yt-dlp/yt-dlp/issues/1060,
1010             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1011         except ssl.SSLError:
1012             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1013             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1014                 # Create a new context to discard any certificates that were already loaded
1015                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1016                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
1017                 for storename in ('CA', 'ROOT'):
1018                     _ssl_load_windows_store_certs(context, storename)
1019             context.set_default_verify_paths()
1020     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1021
1022
def bug_reports_message(before=';'):
    """
    Return the standard "please report this issue" text.

    `before` is the text preceding the message (trailing whitespace is
    stripped). If it is empty or ends a sentence ('.', '!' or '?'), the
    message starts with a capital letter. A non-empty `before` is prepended,
    separated by a single space.
    """
    message = ('please report this issue on  https://github.com/yt-dlp/yt-dlp , '
               'filling out the "Broken site" issue template properly. '
               'Confirm you are on the latest version using -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        message = msg_head_upper = message[0].title() + message[1:]

    if prefix:
        return prefix + ' ' + message
    return message
1033
1034
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may set this to provide a default message
    msg = None

    def __init__(self, msg=None):
        """
        Store the message on self.msg and pass it to Exception.

        Precedence: the explicit `msg` argument, then the `msg` attribute
        already present on the class/instance, then the class name.
        """
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1045
1046
# Exception types treated as network errors (see ExtractorError).
# ssl.CertificateError is included only when the ssl module provides it.
network_exceptions = (
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
) + ((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ())
1051
1052
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        The final message is built as '[ie] video_id: msg (caused by cause)'
        followed by the bug report text unless the error is expected.
        An error raised while one of `network_exceptions` is being handled
        is always marked as expected.
        """
        exc_info = sys.exc_info()
        # NOTE: membership test, so only the exact classes listed in
        # network_exceptions match here, not their subclasses
        if exc_info[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = exc_info  # preserve original exception

        # Join the stringified message: passing the raw `msg` made
        # ''.join() raise TypeError whenever `msg` was not a str
        # (e.g. an exception instance)
        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.orig_msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
1082
1083
class UnsupportedError(ExtractorError):
    """Expected extractor error for a URL that is not supported; keeps the URL in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1089
1090
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match.

    Adds no behaviour of its own; it exists so that this failure can be
    told apart from other ExtractorErrors by type.
    """
    pass
1094
1095
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    Raised when a video is not available from the user's location because
    of geographic restrictions imposed by a website. It is always marked as
    an expected error; `countries` is stored as given.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Any caller-supplied `expected` is overridden
        super().__init__(msg, **dict(kwargs, expected=True))
        self.countries = countries
1107
1108
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may raise this when they are not configured to
    continue on errors. It carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1121
1122
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    # Default message, used by YoutubeDLError.__init__ when no msg is passed
    msg = 'Entry not found in info'
1130
1131
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """`filename`, if given, is appended to the default message."""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1144
1145
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by a PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
1152
1153
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message, used by YoutubeDLError.__init__ when no msg is passed
    msg = 'The download was cancelled'
1157
1158
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Replaces the default message inherited from DownloadCancelled
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1162
1163
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Replaces the default message inherited from DownloadCancelled
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1167
1168
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Replaces the default message inherited from DownloadCancelled
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1172
1173
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """Store `expected` on the instance alongside the message."""
        self.expected = expected
        super().__init__(msg)
1180
1181
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always raised with the class message and never marked as expected
        message = self.msg
        super().__init__(message, expected=False)
1188
1189
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """`err`, if given, is appended to the default message."""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1202
1203
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this when a downloaded file is smaller
    than what the server announced first, indicating that the connection
    was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1219
1220
class XAttrMetadataError(YoutubeDLError):
    """
    Error while writing extended-attribute metadata.

    `reason` classifies the failure as 'NO_SPACE', 'VALUE_TOO_LONG' or
    'NOT_SUPPORTED', derived from the errno `code` and the text of `msg`.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg
        self.reason = self._classify(code, msg)

    @staticmethod
    def _classify(code, msg):
        # Parsing code and msg
        if code in (errno.ENOSPC, errno.EDQUOT):
            return 'NO_SPACE'
        if 'No space left' in msg or 'Disk quota exceeded' in msg:
            return 'NO_SPACE'
        if code == errno.E2BIG or 'Argument list too long' in msg:
            return 'VALUE_TOO_LONG'
        return 'NOT_SUPPORTED'
1235
1236
class XAttrUnavailableError(YoutubeDLError):
    """Distinct YoutubeDLError subtype; it adds no behaviour of its own.

    NOTE(review): judging by the name, this signals that extended attribute
    support is unavailable -- confirm against the code that raises it.
    """
    pass
1239
1240
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """
    Instantiate `http_class` and optionally pin it to a source address.

    @param ydl_handler  Handler whose `_params` dict is consulted for 'source_address'
    @param http_class   Connection class to instantiate with *args/**kwargs
    @param is_https     Only used by the Python 2.6 fallback to decide whether
                        the socket gets wrapped with SSL
    @returns            The connection object

    When params contain a 'source_address', the connection is patched so that
    it binds to that address and only tries remote addresses of the same IP
    family (IPv4 if the source address contains a '.', otherwise IPv6).
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: the `source_address` parameter shadows the outer variable.
            # It is indexed with [0] and passed to bind(), so it is expected
            # to be an (address, port) pair such as `sa` below, not None.
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the textual form of the source address
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                # The host resolved, but not to any address of the required family
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connect wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and close the socket before trying the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed: re-raise the last error, if there was one
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 in the bound address (presumably lets the OS choose the local port)
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                # Replacement for connect() on connection objects that have
                # no `source_address` attribute
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1304
1305
def handle_youtubedl_headers(headers):
    """
    Apply the internal 'Youtubedl-no-compression' marker header.

    If the marker is present, return a new dict without the marker and
    without any 'Accept-Encoding' header (matched case-insensitively).
    Otherwise return `headers` itself, untouched.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    filtered_headers = {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding'
    }
    del filtered_headers[marker]
    return filtered_headers
1314
1315
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """`params` is kept as self._params; remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, going through a SOCKS connection class if the
        internal 'Ytdl-socks-proxy' header is set (the header is removed)."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress `data`, first with a negative window size
        (-zlib.MAX_WBITS) and, if that fails, with zlib's defaults.
        Empty input is returned as is."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Pre-process an outgoing request: percent-encode the URL, add the
        default headers that are missing and apply the internal marker
        headers. Returns the (possibly replaced) request."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Headers come from params['http_headers'], falling back to std_headers;
        # headers already present on the request are not overwritten
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response: transparently decode gzip/deflate bodies
        and percent-encode the Location header of 3xx responses."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; if none of the
                # attempts decompresses, re-raise the first error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            # Wrap the decoded body in a new response that reuses the original
            # headers object, URL and status code
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    # Recover the raw header bytes and reinterpret them as UTF-8
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
1439
1440
def make_socks_conn_class(base_class, socks_proxy):
    """
    Create a subclass of `base_class` that connects through a SOCKS proxy.

    @param base_class   HTTPConnection or HTTPSConnection (or a subclass)
    @param socks_proxy  Proxy URL; the scheme must be one of socks5, socks,
                        socks4 or socks4a. The port defaults to 1080 and the
                        username/password, if present, are URL-unquoted.
    @returns            The connection class
    @raises ValueError  If the proxy URL has any other scheme
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    socks_types = {
        'socks5': ProxyType.SOCKS5,
        'socks': ProxyType.SOCKS4,
        'socks4': ProxyType.SOCKS4,
        'socks4a': ProxyType.SOCKS4A,
    }
    if scheme not in socks_types:
        # Previously an unknown scheme left the proxy type unbound and failed
        # further down with an UnboundLocalError. Only the scheme is reported
        # since the URL itself may contain credentials.
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % scheme)
    socks_type = socks_types[scheme]

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            """Connect to (host, port) through the proxy, wrapping the
            socket with SSL for HTTPS connections."""
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1482
1483
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections through
    _create_http_connection, optionally via a SOCKS proxy."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """`params` is kept as self._params; `https_conn_class` defaults to
        HTTPSConnection; remaining arguments go to HTTPSHandler."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open `req`, going through a SOCKS connection class if the
        internal 'Ytdl-socks-proxy' header is set (the header is removed)."""
        # Forward whichever of these the base handler provides:
        # `_context` (python > 2.6) and `_check_hostname` (python 3.x)
        forwarded = (('_context', 'context'), ('_check_hostname', 'check_hostname'))
        open_kwargs = {
            option: getattr(self, attr)
            for attr, option in forwarded
            if hasattr(self, attr)
        }

        conn_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            conn_class = make_socks_conn_class(conn_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1507
1508
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    MozillaCookieJar that reads and writes cookie files as UTF-8, skips
    malformed entries on load and handles `expires=0` session cookies.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.

        Raises ValueError if neither `filename` nor self.filename is set.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue

                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                expires = compat_str(cookie.expires) if cookie.expires is not None else ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name, value = '', cookie.name
                else:
                    name, value = cookie.name, cookie.value

                fields = [cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value]
                f.write('\t'.join(fields) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Entries that are malformed are skipped with a warning on stderr.
        Raises ValueError if neither `filename` nor self.filename is set.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Drop the HttpOnly marker so the entry is parsed like any other
            prefix = self._HTTPONLY_PREFIX
            if line.startswith(prefix):
                line = line[len(prefix):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            columns = line.split('\t')
            if len(columns) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(columns))
            entry = self._CookieFileEntry(*columns)
            if entry.expires_at and not entry.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
            return line

        # Collect the valid lines in memory and let the base class parse them
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    prepared = prepare_line(line)
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                else:
                    cf.write(prepared)
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1625
1626
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose HTTPS hooks reuse the HTTP request/response hooks."""

    def __init__(self, cookiejar=None):
        """Pass `cookiejar` straight through to HTTPCookieProcessor."""
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # The commented-out code below is a disabled Python 2 workaround,
        # kept for reference only:
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic is processed exactly like HTTP traffic
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1649
1650
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """Redirect handler used by YoutubeDL

    Derived from the HTTPRedirectHandler implementation in CPython [1],
    with two changes:
     - the redirect URL is always coerced to unicode under python 2
     - the experimental HTTP status code 308 Permanent Redirect [2],
       which some sites use [3], is followed as well

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # All supported redirect codes reuse the stock 302 handling
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect response.

        Called by the http_error_30x methods once a redirection has been
        received. Returns the new Request to issue. Raises HTTPError when
        the status code / request method combination must not be followed.
        """
        method = req.get_method()
        body_less_redirect = code in (301, 302, 303, 307, 308) and method in ('GET', 'HEAD')
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        post_redirect = code in (301, 302, 303) and method == 'POST'
        if not body_less_redirect and not post_redirect:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Headers describing the original request body are not carried over
        content_headers = ('content-length', 'content-type')
        forwarded_headers = {
            name: value for name, value in req.headers.items()
            if name.lower() not in content_headers}
        return compat_urllib_request.Request(
            newurl, headers=forwarded_headers,
            origin_req_host=req.origin_req_host, unverifiable=True)
1706
1707
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Returns a (timezone, date_str) tuple. timezone is the UTC offset as a
    datetime.timedelta; it is zero when no designator is recognized or when
    the designator is "Z". date_str has the recognized designator removed.
    """
    tz_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    if not tz_match.group('sign'):
        # A bare "Z" carries no numeric offset
        return datetime.timedelta(), remainder

    direction = 1 if tz_match.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1732
1733
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter: separator expected between the date and the time part
    timezone: UTC offset (datetime.timedelta) to subtract; when None it is
              extracted from the end of date_str
    Returns None when date_str is None or does not match the format.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the expected format
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        moment = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter))
        return calendar.timegm((moment - timezone).timetuple())
    except ValueError:
        return None
1751
1752
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1755
1756
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    for fmt in date_formats(day_first):
        # No early exit: a later matching format overrides an earlier one
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        parsed_tuple = email.utils.parsedate_tz(cleaned)
        if parsed_tuple:
            try:
                result = datetime.datetime(*parsed_tuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else compat_str(result)
1783
1784
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # The PM marker has to be detected before it is stripped below
    pm_hours = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    unknown_tz = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if unknown_tz:
        date_str = date_str[:-len(unknown_tz.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    with_nanos = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if with_nanos:
        date_str = with_nanos.group(1)

    pm_shift = datetime.timedelta(hours=pm_hours)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(date_str, fmt) - timezone + pm_shift
            return calendar.timegm(moment.timetuple())
        except ValueError:
            pass

    # Fall back to RFC 2822 style dates
    parsed_tuple = email.utils.parsedate_tz(date_str)
    if parsed_tuple:
        return calendar.timegm(parsed_tuple) + pm_hours * 3600
    return None
1816
1817
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    Returns default_ext when url is None, contains no dot, or no usable
    extension can be derived from it.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1829
1830
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" """
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1833
1834
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return now
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if relative is None:
        # Plain date: parse it with the given format
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')

    if unit in ('month', 'year'):
        # timedelta has no month/year unit; shift by calendar months instead
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1875
1876
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    If "strict", only YYYYMMDD or (now|today|yesterday), optionally followed
    by [+-][0-9](day|week|month|year)(s)?, is allowed

    format: string date format used to return datetime object from

    Raises ValueError if "strict" is set and date_str is not in an allowed form
    """
    # The offset part is optional so that a bare "now"/"today"/"yesterday"
    # is accepted in strict mode too
    if strict and not re.fullmatch(
            r'\d{8}|(now|today|yesterday)([+-]\d+(day|week|month|year)(s)?)?', date_str):
        raise ValueError(f'Invalid date format {date_str}')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1889
1890
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    year_offset, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_offset
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1898
1899
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    precision: microsecond|second|minute|hour|day. With 'microsecond'
               the object is returned as is.
    """
    if precision == 'microsecond':
        return dt

    epoch_seconds = calendar.timegm(dt.timetuple())
    step = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}[precision]
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1916
1917
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1926
1927
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError when start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1957
1958
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode with the preferred encoding when bytes are returned
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1967
1968
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1975
1976
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32. It is only
    used when `out` has file descriptor 1 or 2 and the corresponding standard
    handle is a console; in every other case False is returned and nothing
    is written. Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # Maps the file descriptor to the id passed to GetStdHandle
    # (-11 and -12 are STD_OUTPUT_HANDLE and STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, for a handle whose file type
        # (ignoring the REMOTE flag) is not a character device, or when
        # GetConsoleMode fails for it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; such a character is then written on its own
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; 2 is passed as
        # the length then (presumably its two UTF-16 code units)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
2049
2050
def write_string(s, out=None, encoding=None):
    """Write the string s to out (sys.stderr by default) and flush it

    On win32, when no encoding is given and out has a fileno, the console
    writer is tried first. Otherwise s is written as encoded bytes to binary
    streams (or to the stream's underlying buffer), or as text if neither
    applies. Characters that cannot be encoded are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno')
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
2071
2072
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integers

    Items that are not already integers are converted with ord().
    An empty (or None) input gives an empty list.
    """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        return [ord(char) for char in bs]
    return list(bs)  # Python 3
2080
2081
def intlist_to_bytes(xs):
    """Pack a list of integers (one unsigned byte each) into a bytes object"""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
2086
2087
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using
# LockFileEx/UnlockFileEx on win32 and fcntl elsewhere. Where fcntl cannot
# be imported, both functions raise IOError.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covered by the lock (low/high DWORD of its length)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file f; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 requests an exclusive lock, 0x1 fails immediately
        # instead of waiting
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file f

            exclusive: take an exclusive lock instead of a shared one
            block: wait for the lock; if False, BlockingIOError is raised
                   when the lock is held elsewhere
            """
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                # Applies to shared locks as well, so that a non-blocking
                # shared request never waits
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock taken by _lock_file"""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive, block):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
2175
2176
class locked_file(object):
    """File wrapper that holds a lock on the file while it is in use

    The file is opened on construction and locked on __enter__()/open():
    shared for read modes, exclusive otherwise. __exit__()/close() release
    the lock and close the file.

    mode: one of 'r', 'rb', 'a', 'ab', 'w', 'wb'
    block: passed to _lock_file; whether to wait for the lock
    """
    _closed = False

    def __init__(self, filename, mode, block=True, encoding=None):
        assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode
        self.block = block

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
        except IOError:
            self.f.close()
            # Nothing is locked, so a later close()/__exit__() must not try
            # to unlock the already closed file
            self._closed = True
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            if not self._closed:
                _unlock_file(self.f)
        finally:
            self.f.close()
            self._closed = True

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

    def flush(self):
        self.f.flush()

    def open(self):
        return self.__enter__()

    def close(self, *args):
        """Unlock and close the file; any arguments are accepted and ignored"""
        self.__exit__(None, None, None)
2220
2221
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' if unknown"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2225
2226
def shell_quote(args):
    """Return the arguments shell-quoted and joined by spaces"""
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
2236
2237
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into the fragment of the returned URL. Data that
    is already smuggled in url is kept and takes precedence over data.
    The data dict passed in is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so that the caller's dict is left untouched
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2246
2247
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) for a URL produced by smuggle_url

    For a URL without smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, payload = smug_url.rpartition('#')
        encoded = compat_parse_qs(payload)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
2255
2256
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    num: the number to format (anything float_or_none can convert)
    fmt: %-format string that receives (scaled number, suffix)
    factor: ratio between consecutive suffixes; with 1024 the suffixes
            become Ki, Mi, ...
    Returns None if num is not a number, negative, or not finite.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0 or not math.isfinite(num):
        return None
    suffixes = 'kMGTPEZY'
    # Clamp the exponent: values <= 1/factor would otherwise produce a
    # negative index (selecting a suffix from the end of the list), and
    # values beyond the largest suffix would raise IndexError
    exponent = 0 if num == 0 else min(max(int(math.log(num, factor)), 0), len(suffixes))
    suffix = ['', *suffixes][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2268
2269
def format_bytes(bytes):
    """Format a byte count with binary suffixes (KiB, MiB, ...), or 'N/A'"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2272
2273
def lookup_unit_table(unit_table, s):
    """Parse a leading "<number><unit>" in s using a unit -> multiplier table

    The number may use ',' or '.' as decimal separator. Returns the scaled
    value as an int, or None if s does not start with a number followed by
    one of the units.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
2283
2284
def parse_filesize(s):
    """Parse a human readable file size such as "1.5 MiB" into bytes

    Returns None if s is None or no known size unit is found.
    """
    if s is None:
        return None

    # (symbol, decimal name, binary name) in order of increasing magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        unit_table.update((
            (symbol + 'iB', binary),
            (symbol + 'B', decimal),
            (symbol.lower() + 'B', binary),
            (symbol + 'b', decimal),
            (symbol.lower() + 'b', decimal),
            (decimal_name + 'bytes', decimal),
            (binary_name + 'bytes', binary),
        ))

    return lookup_unit_table(unit_table, s)
2354
2355
def parse_count(s):
    """Parse a count such as "1,234", "1.5k" or "2M views" into an int

    Returns None if s is None or no count can be found.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Plain number followed by other text
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading_number.group(1)) if leading_number else None
2383
2384
def parse_resolution(s):
    """Extract the resolution mentioned in a string

    Returns a dict with 'width' and 'height' for "WxH" style values, with
    only 'height' for "720p"/"1080i" and "4k"/"8k" style values, and an
    empty dict if s is None or nothing is recognized.
    """
    if s is None:
        return {}

    dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        width, height = map(int, dimensions.group('w', 'h'))
        return {
            'width': width,
            'height': height,
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        return {'height': int(k_label.group(1)) * 540}

    return {}
2405
2406
def parse_bitrate(s):
    """Return the number in front of "kbps" in s, or None if there is none"""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2413
2414
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name

    Month names of another language are used when lang is given and known.
    Returns None for an unknown name.
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = month_names.index(name)
    except ValueError:
        return None
    return position + 1
2424
2425
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations; None for an unknown abbreviation """
    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2434
2435
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2442
2443
def setproctitle(title):
    """Set the process name through libc's prctl(), on a best-effort basis

    Silently does nothing on Jython, when libc.so.6 cannot be loaded, or
    when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initializing the buffer from the bytes makes ctypes allocate one extra
    # item for the terminating NUL. A buffer sized to exactly the title
    # length would hand prctl an unterminated string.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2468
2469
def remove_start(s, start):
    """Return s without the prefix start; s itself if it is None or has no such prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2472
2473
def remove_end(s, end):
    """Return s without the suffix end; s itself if it is None or has no such suffix"""
    if s is not None and s.endswith(end):
        # Slice by the resulting length: s[:-len(end)] would be s[:0]
        # (an empty result) for an empty suffix
        return s[:len(s) - len(end)]
    return s
2476
2477
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2485
2486
def get_domain(url):
    """Return the host part of url without scheme and leading "www.", or None"""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
2490
2491
def url_basename(url):
    """Return the last component of the path of url"""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
2495
2496
def base_url(url):
    """Return the part of an http(s) URL up to and including the last '/'
    that precedes any query or fragment"""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
2499
2500
def urljoin(base, path):
    """Join path onto base, tolerating bad input

    Bytes arguments are decoded as UTF-8. Returns path itself if it is
    already absolute or protocol-relative. Returns None if path is empty
    or not a string, or if base is not an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    usable_base = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not usable_base:
        return None
    return compat_urlparse.urljoin(base, path)
2514
2515
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
2519
2520
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
2524
2525
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiply by invscale and floor-divide by scale.

    If get_attr is given and v is not None, the named attribute of v is
    converted instead. `default` is returned when the conversion raises
    ValueError, TypeError or OverflowError.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        result = default
    return result
2533
2534
def str_or_none(v, default=None):
    """Return v converted to a text string, or `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
2537
2538
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned unchanged; strings have ',', '.' and '+' removed
    before conversion; any other type yields None.
    """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if not isinstance(int_str, compat_str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2546
2547
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiply by invscale and divide by scale.

    `default` is returned when v is None or the conversion raises
    ValueError or TypeError.
    """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2555
2556
def bool_or_none(v, default=None):
    """Return v when it is an actual bool, otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
2559
2560
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace when it is a text string,
    otherwise `default`."""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2563
2564
def url_or_none(url):
    """Return the whitespace-stripped url when it is scheme-relative ('//...')
    or uses an http(s), rtmp/rtsp-family, mms or ftp(s) scheme; else None."""
    if not url or not isinstance(url, compat_str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2570
2571
def request_to_url(req):
    """Return the full URL of a Request object; anything else is returned unchanged."""
    is_request = isinstance(req, compat_urllib_request.Request)
    return req.get_full_url() if is_request else req
2577
2578
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with date_format, returning `default` on failure.

    timestamp:   a number (interpreted as a UNIX timestamp, in UTC) or a
                 'YYYYMMDD' string
    date_format: strftime format string
    default:     returned when timestamp has another type or cannot be
                 converted/formatted
    """
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # datetime_object is still None for unsupported types -> AttributeError
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # utcfromtimestamp() raises OverflowError/OSError for timestamps
        # outside the range supported by the platform
        return default
2589
2590
def parse_duration(s):
    """Parse a textual duration and return it in seconds, or None.

    Three notations are tried in order: clock style
    ('[[[DD:]HH:]MM:]SS[.ms]'), unit style ('1h 2m 3s', 'PT1H2M3S', ...)
    and a single fractional amount of hours or minutes ('1.5 hours').
    None is returned for non-string, empty or unrecognised input.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    clock_match = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock_match:
        days, hours, mins, secs, ms = clock_match.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Years, months and weeks are accepted by the pattern but not captured
        unit_match = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if unit_match:
            days, hours, mins, secs, ms = unit_match.groups()
        else:
            single_match = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single_match:
                return None
            hours, mins = single_match.groups()

    # Missing components contribute an integer 0 so that the result stays
    # the integer 0 when nothing at all was captured
    from_secs = float(secs) if secs else 0
    from_mins = float(mins) * 60 if mins else 0
    from_hours = float(hours) * 60 * 60 if hours else 0
    from_days = float(days) * 24 * 60 * 60 if days else 0
    from_ms = float(ms.replace(':', '.')) if ms else 0
    return 0 + from_secs + from_mins + from_hours + from_days + from_ms
2653
2654
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2661
2662
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2668
2669
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when launching the binary raises OSError. """
    cmd = [exe] + args
    try:
        proc = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate_or_kill()
    except OSError:
        return False
    return exe
2678
2679
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout/stderr as text,
    or False when the process cannot be started (OSError)."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        proc = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = proc.communicate_or_kill()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        return output.decode('ascii', 'ignore')
    return output
2693
2694
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version from a program's output.

    Returns group 1 of the first match of version_re (a generic
    'version X' pattern when omitted), or `unrecognized` without a match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2704
2705
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or printed nothing) """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2712
2713
class LazyList(collections.abc.Sequence):
    """Lazy immutable list built on top of an iterable.

    Items are pulled from the iterable only when needed and are cached.
    Note that slices of a LazyList are plain lists, not LazyList."""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self.__iterable = iter(iterable)
        self.__cache = _cache if _cache is not None else []
        self.__reversed = reverse

    def __iter__(self):
        if not self.__reversed:
            yield from self.__cache
            for item in self.__iterable:
                self.__cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def __exhaust(self):
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        """ Evaluate the entire iterable """
        everything = self.__exhaust()
        return everything[::-1] if self.__reversed else everything[::1]

    @staticmethod
    def __reverse_index(x):
        if x is None:
            return None
        return -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
        else:
            missing = max(start or 0, stop or 0) - len(self.__cache) + 1
            if missing > 0:
                self.__cache.extend(itertools.islice(self.__iterable, missing))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self.__exhaust())

    def __reversed__(self):
        # The new object shares both the source iterator and the cache
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # The copy shares both the source iterator and the cache
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2802
2803
class PagedList:
    """Base class for lists whose entries are fetched one page at a time.

    pagefunc(pagenum) must return an iterable with the entries of that
    page. Subclasses implement _getslice(start, end).
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page pagenum as a list (cached if enabled)"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            page_results = cached
        elif pagenum > self._pagecount:
            # Pages beyond the known page count are empty
            page_results = []
        else:
            page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2842
2843
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages one by one until a page that is not
    full is encountered (or the requested range is exhausted)."""

    def _getslice(self, start, end):
        """Yield the entries in [start, end); end=None means "until the end".

        If fetching a page raises, the previous page is recorded as the
        last one and the exception is re-raised.
        """
        # An empty or inverted range contains nothing. Without this guard the
        # page arithmetic below would yield entries anyway (e.g. the whole
        # first page for start == end == 0).
        if end is not None and end <= start:
            return

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted entry, if the range ends in this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2881
2882
class InAdvancePagedList(PagedList):
    """PagedList for which the total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        """Yield the entries in [start, end); end=None means "until the last page"."""
        first_page = start // self._pagesize
        skip_elems = start - first_page * self._pagesize
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start

        for pagenum in range(first_page, last_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                # Only the first fetched page starts mid-way
                page_results = page_results[skip_elems:]
                skip_elems = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    yield from page_results[:remaining]
                    break
                remaining -= len(page_results)
            yield from page_results
2905
2906
def uppercase_escape(s):
    """Replace every literal '\\UXXXXXXXX' escape in s with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2913
2914
def lowercase_escape(s):
    """Replace every literal '\\uXXXX' escape in s with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2921
2922
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Reserved and already percent-encoded characters are left untouched
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2928
2929
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; the other components are percent-escaped
    replacements = {'netloc': parts.netloc.encode('idna').decode('ascii')}
    for field in ('path', 'params', 'query', 'fragment'):
        replacements[field] = escape_rfc3986(getattr(parts, field))
    return parts._replace(**replacements).geturl()
2940
2941
def parse_qs(url):
    """Return the parsed query string of url."""
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2944
2945
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, and close it.

    Byte-order marks and surrounding whitespace are removed; blank lines
    and lines starting with '#', ';' or ']' are skipped; a trailing
    comment introduced by whitespace followed by '#' is dropped.
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        without_comment = re.split(r'\s#', url, maxsplit=1)[0]
        return without_comment.rstrip()

    with contextlib.closing(batch_fd) as fd:
        cleaned = (fixup(line) for line in fd)
        return [url for url in cleaned if url]
2963
2964
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2967
2968
def update_url_query(url, query):
    """Return url with the parameters in `query` merged into its query string.

    Parameters already present in the URL are overwritten by `query`.
    The url is returned unchanged when `query` is empty.
    """
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed_url.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
2977
2978
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with the given pieces replaced.

    headers are merged over the existing ones, query is merged into the
    URL, and url/data replace the originals when truthy. HEAD and PUT
    requests keep their method; the timeout attribute is carried over
    when present.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    method_classes = {'HEAD': HEADRequest, 'PUT': PUTRequest}
    req_type = method_classes.get(req.get_method(), compat_urllib_request.Request)
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2997
2998
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary.

    Returns (body_bytes, content_type). Raises ValueError when the
    boundary occurs inside one of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter)
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
3019
3020
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body, content_type) pair. Raises ValueError when a
    caller-specified boundary occurs inside the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is never replaced; a clash is an error
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashed with the data; draw another one
            continue
3049
3050
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of a list/tuple, in d.

    With a list/tuple, keys that are missing or map to None are skipped,
    as are falsy values when skip_false_values is set; `default` is
    returned if no key qualifies. A single key is a plain d.get().
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
3059
3060
def try_get(src, getter, expected_type=None):
    """Apply getter (or each of several getters) to src and return the first
    result obtained without AttributeError, KeyError, TypeError or
    IndexError that is an instance of expected_type (any value if None).

    Returns None when no getter yields such a value.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
3070
3071
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to earlier ones.

    None values are ignored. A key already present is only overwritten
    when it holds an empty string and the new value is a non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k not in merged:
                merged[k] = v
                continue
            current = merged[k]
            if (isinstance(v, compat_str) and v
                    and isinstance(current, compat_str) and not current):
                merged[k] = v
    return merged
3084
3085
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a text string, decoding it with encoding/errors
    when it is not one already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
3088
3089
# Age limit associated with each US movie rating label.
# parse_age_limit() looks labels up here after upper-casing its input.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3097
3098
# Age limit associated with each "TV-xx" parental guideline label.
# parse_age_limit() builds its matching pattern from these keys with the
# leading 'TV-' removed, so every key must keep that prefix.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3107
3108
def parse_age_limit(s):
    """Convert an age rating to an integer age limit, or None.

    Accepts plain ints in the range 0..21, strings such as '18' or '18+',
    US movie ratings (US_RATINGS) and TV parental guidelines
    (TV_PARENTAL_GUIDELINES, with 'TV-', 'TV_' or 'TV' prefix).
    """
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, s)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3124
3125
def strip_jsonp(code):
    """Remove a JSONP wrapper ('callback(...);', optionally followed by
    '//' comments) and return the wrapped payload; input that is not
    wrapped is returned unchanged."""
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
3134
3135
def js_to_json(code, vars={}):
    """Convert a JavaScript object literal to JSON text.

    Handles single-quoted strings, unquoted and numeric keys, hex/octal
    integers, comments, trailing commas, undefined/void 0 and
    new Date("...") wrappers.

    vars is a dict of var, val pairs to substitute
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # JS string escapes that need translating; anything else is kept verbatim
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_escape(m):
        return STRING_ESCAPES.get(m.group(0), m.group(0))

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token in ('undefined', 'void 0'):
            return 'null'
        if token.startswith(('/*', '//', '!')) or token == ',':
            return ""

        if token[0] in ("'", '"'):
            # Quoted string: re-quote with double quotes
            return '"%s"' % re.sub(r'(?s)\\.|"', fix_escape, token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if token.endswith(':') else '%d' % i

        if token in vars:
            return vars[token]

        return '"%s"' % token

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3184
3185
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in
    quality_ids, or to -1 when the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
3194
3195
# Set of recognised stage names.
# NOTE(review): presumably the points at which a postprocessor may be
# scheduled to run - confirm against the callers.
POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}


# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Additional output template types, each mapped to either a string or None.
# NOTE(review): the strings look like filename suffixes for that kind of
# file - confirm against the callers.
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# This is a template, not a finished pattern: {0} must be filled in with the
# pattern for the mapping key and {1} with the pattern for the conversion type.
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of the %-format syntax referenced above
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3235
3236
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than `length` are cut so that, together with the
    trailing '...', they are `length` characters long. None stays None. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
3245
3246
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError when a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
3249
3250
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or either version string cannot be parsed, the
    answer is decided by assume_new (not outdated if it is true).
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
3258
3259
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3266
3267
def args_to_str(args):
    """Get a short string representation for a subprocess command:
    every argument shell-quoted, joined by single spaces."""
    quoted_args = (compat_shlex_quote(arg) for arg in args)
    return ' '.join(quoted_args)
3271
3272
def error_to_compat_str(err):
    """Return the message of err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
3280
3281
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    mt: the MIME type, optionally followed by ';'-separated parameters
        (which are ignored), or None.

    Returns None for None. Otherwise the extension is looked up, in order,
    by full type, by subtype and by structured-syntax suffix ('+xml', ...);
    if nothing is known, the subtype itself is returned with '+' replaced
    by '.'. Matching is case-insensitive and the result is lower-case.
    """
    if mt is None:
        return None

    # Drop any parameters and normalise the case: MIME type and subtype
    # names are case-insensitive, so 'AUDIO/MP4' must behave like 'audio/mp4'
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3344
3345
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension, a filename or a URL.

    Returns None for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # A value without any dot is taken to be a bare extension
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(target)
    return mime_type
3352
3353
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string (RFC 6381) into its parts.

    Returns a dict with 'vcodec', 'acodec' and 'dynamic_range' (plus
    'tcodec' when a text codec is present); a missing video/audio codec is
    reported as 'none'. If no codec is recognised but exactly two were
    given, they are taken as video and audio codec. Returns {} otherwise.
    A warning is written to stderr for every unknown codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3',
                    'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    TEXT_CODECS = ('stpp', 'wvtt')

    raw_entries = codecs_str.strip().strip(',').split(',')
    split_codecs = [entry for entry in map(str.strip, raw_entries) if entry]

    vcodec = acodec = tcodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. 'av01' and 'vp09' match 'av1' and 'vp9'
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                continue  # only the first video codec counts
            vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif codec in TEXT_CODECS:
            if not tcodec:
                tcodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec or tcodec:
        info = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if tcodec is not None:
            info['tcodec'] = tcodec
        return info
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3395
3396
def urlhandle_detect_ext(url_handle):
    """Determine the file extension of a response.

    The filename of an 'attachment' Content-Disposition header is
    preferred; otherwise the extension is derived from Content-Type.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
3409
3410
def encode_data_uri(data, mime_type):
    """Return a base64 ``data:`` URI carrying the bytes *data* labelled as *mime_type*."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3413
3414
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when the user set no limit, and content without a
    # limit of its own is available to everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3423
3424
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the decoding; otherwise UTF-8 is
    assumed. Returns the regex match object (truthy) when the decoded text
    starts with optional whitespace followed by "<", else None.
    """

    # The 4-byte UTF-32 marks must be checked before the UTF-16 ones,
    # since b'\xff\xfe' is a prefix of the UTF-32-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, skip = next(
        ((enc, len(bom)) for bom, enc in BOMS if first_bytes.startswith(bom)),
        ('utf-8', 0))
    s = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
3443
3444
def determine_protocol(info_dict):
    """Return the download protocol for *info_dict*.

    An explicit 'protocol' entry wins. Otherwise the sanitized 'url' is
    inspected: a leading rtmp/mms/rtsp prefix, then an m3u8/f4m extension,
    and finally the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
3465
3466
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """Render a header row and data rows as an aligned plain-text table.

    Each row is a list of values. Within a cell, the text following a tab
    character is right aligned. When delim is given, a line made of that
    string is inserted below the header. extra_gap adds spaces between
    columns, and hide_empty drops every column whose data cells are all
    empty. Returns the table as a single newline-joined string.
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        pairs = itertools.zip_longest(keep_flags, row, fillvalue=True)
        return [cell for keep, cell in pairs if keep]

    # A zero width is falsy, so the data widths double as "keep this column" flags
    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    max_lens = column_widths([header_row] + data)
    extra_gap += 1
    table = [header_row]
    if delim:
        rule = [delim * (col_width + extra_gap) for col_width in max_lens]
        rule[-1] = rule[-1][:-extra_gap]  # Remove extra_gap from end of delimiter
        table.append(rule)
    table.extend(data)

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            slack = max_lens[pos] - width(text)
            if '\t' in text:
                # The tab expands to the padding, pushing the rest to the right edge
                cells.append(text.replace('\t', ' ' * slack) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (slack + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3497
3498
3499 def _match_one(filter_part, dct, incomplete):
3500     # TODO: Generalize code with YoutubeDL._build_format_filter
3501     STRING_OPERATORS = {
3502         '*=': operator.contains,
3503         '^=': lambda attr, value: attr.startswith(value),
3504         '$=': lambda attr, value: attr.endswith(value),
3505         '~=': lambda attr, value: re.search(value, attr),
3506     }
3507     COMPARISON_OPERATORS = {
3508         **STRING_OPERATORS,
3509         '<=': operator.le,  # "<=" must be defined above "<"
3510         '<': operator.lt,
3511         '>=': operator.ge,
3512         '>': operator.gt,
3513         '=': operator.eq,
3514     }
3515
3516     operator_rex = re.compile(r'''(?x)\s*
3517         (?P<key>[a-z_]+)
3518         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3519         (?:
3520             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3521             (?P<strval>.+?)
3522         )
3523         \s*$
3524         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3525     m = operator_rex.search(filter_part)
3526     if m:
3527         m = m.groupdict()
3528         unnegated_op = COMPARISON_OPERATORS[m['op']]
3529         if m['negation']:
3530             op = lambda attr, value: not unnegated_op(attr, value)
3531         else:
3532             op = unnegated_op
3533         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3534         if m['quote']:
3535             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3536         actual_value = dct.get(m['key'])
3537         numeric_comparison = None
3538         if isinstance(actual_value, compat_numeric_types):
3539             # If the original field is a string and matching comparisonvalue is
3540             # a number we should respect the origin of the original field
3541             # and process comparison value as a string (see
3542             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3543             try:
3544                 numeric_comparison = int(comparison_value)
3545             except ValueError:
3546                 numeric_comparison = parse_filesize(comparison_value)
3547                 if numeric_comparison is None:
3548                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3549                 if numeric_comparison is None:
3550                     numeric_comparison = parse_duration(comparison_value)
3551         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3552             raise ValueError('Operator %s only supports string values!' % m['op'])
3553         if actual_value is None:
3554             return incomplete or m['none_inclusive']
3555         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3556
3557     UNARY_OPERATORS = {
3558         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3559         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3560     }
3561     operator_rex = re.compile(r'''(?x)\s*
3562         (?P<op>%s)\s*(?P<key>[a-z_]+)
3563         \s*$
3564         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3565     m = operator_rex.search(filter_part)
3566     if m:
3567         op = UNARY_OPERATORS[m.group('op')]
3568         actual_value = dct.get(m.group('key'))
3569         if incomplete and actual_value is None:
3570             return True
3571         return op(actual_value)
3572
3573     raise ValueError('Invalid filter part %r' % filter_part)
3574
3575
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
        When incomplete, all conditions passes on missing fields

        Conditions are separated by "&"; a literal ampersand inside a
        condition is written as a backslash followed by "&".
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3583
3584
def match_filter_func(filter_str):
    """Build a callback that checks an info dict against filter_str.

    The returned function gives None when the info dict passes the filter,
    and otherwise a message explaining that the video is being skipped.
    """
    def _match_func(info_dict, *args, **kwargs):
        if not match_str(filter_str, info_dict, *args, **kwargs):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
3593
3594
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression to seconds.

    Accepts an offset in seconds with an optional "s" suffix ("12.5s") or a
    clock value "H:MM:SS" whose fraction is separated by "." or ":".
    Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ":" before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
3606
3607
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode of the form HH:MM:SS,mmm."""
    time_parts = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time_parts
3610
3611
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode of the form H:MM:SS.cc."""
    parts = timetuple_from_msec(seconds * 1000)
    # The last field holds hundredths of a second; %d drops the fraction
    centiseconds = parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*parts[:-1], centiseconds)
3615
3616
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph elements
    '''
    # Legacy namespace URIs are rewritten to the current ones before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> resolved styling properties
    default_style = {}  # styling taken from the body/div elements

    class TTMLPElementParser(object):
        '''Parser target that renders one paragraph as SRT text with inline tags'''
        # NOTE(review): the two lists below are class attributes, so they are
        # shared by every parser created during one dfxp2srt call. start()
        # pushes onto _applied_styles whenever an element has any style, but
        # end() pops only if that element also opened a tag, so an entry can
        # outlive its paragraph and suppress attributes in later ones.
        # Left unchanged here because fixing it alters the emitted subtitles.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: defaults, then the referenced style, then
                # styling attributes set directly on the element
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened for this element, innermost first
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the paragraph and stream it through the target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent that is
    # declared later in the document, so passes are repeated until every
    # parent has been resolved.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    while True:
        repeat = False
        resolved_before = len(styles)
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # Whether a style is deferred depends only on which ids are resolved,
        # so a pass that resolved nothing new would repeat identically forever
        # (missing, cyclic or property-less parent). Stop instead of hanging;
        # styles that are still unresolved are left undefined.
        if not repeat or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # Cue numbers follow the paragraph position, so skipped paragraphs leave gaps
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3779
3780
def cli_option(params, command_option, param):
    """Build the command-line arguments for an option that takes a value.

    Looks up the key param in params. Returns an empty list when the value
    is missing or None; otherwise returns [command_option, value] with the
    value converted to a string. Falsy values such as 0 are converted too,
    so the result never contains a non-string argument.
    """
    param = params.get(param)
    if param is None:
        return []
    return [command_option, str(param)]
3786
3787
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line arguments for a boolean option.

    Returns an empty list when the key param is missing from params or is
    None. Otherwise the value must be a bool and is rendered as true_value
    or false_value, either as a separate argument or, when separator is
    given, joined to command_option in a single argument.
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if not separator:
        return [command_option, value]
    return [command_option + separator + value]
3796
3797
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3801
3802
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select the argument list configured for the first matching entry of keys.

    argdict maps lowercase keys to argument lists. Each entry of keys is a
    key or a group of keys; for the first entry with at least one key present
    in argdict, the lists of all its present keys are concatenated and
    returned. default is returned when nothing matches or argdict is None.

    For backward compatibility argdict may be a plain list/tuple, which is
    returned as-is when use_compat is true and ignored otherwise.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        present = [args for args in candidates if args is not None]
        if present:
            return list(itertools.chain.from_iterable(present))
    return default
3821
3822
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up configured arguments for exe under main_key.

    The root key is the lowercased exe when it equals the lowercased
    main_key, otherwise "main_key+exe". Each entry of keys is appended to
    the root key as a suffix (just the root key when keys is empty). Only
    when the bare root key is among the candidates are the (main_key, exe)
    group and 'default' added as fallbacks; otherwise use_compat is
    switched off. The lookup itself is done by cli_configuration_args.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'

    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if not same_name:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3834
3835
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps two-letter codes to three-letter codes. A few superseded
    # two-letter codes are kept next to their replacements.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up. Returns None
        for an unknown code.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several two-letter codes map to the same three-letter code, the
        one listed first in the table is returned. Returns None for an
        unknown code.
        """
        candidates = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(candidates, None)
4039
4040
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive. Returns None for an unknown code.
        """
        country_code = code.upper()
        return cls._country_map.get(country_code)
4299
4300
class GeoUtils(object):
    """Helpers for producing IPv4 addresses that belong to a given country or CIDR block"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Pick a random IPv4 address from a country's block or from an explicit CIDR block

        @param code_or_block  Either a two-character country code (looked up
                              case-insensitively in _country_ip_map) or a CIDR
                              block in "address/prefix-length" form
        @returns              The address in dotted-quad notation, or None when
                              the country code has no known block
        """
        cidr = code_or_block
        if len(code_or_block) == 2:
            cidr = cls._country_ip_map.get(code_or_block.upper())
            if not cidr:
                return None
        network, prefix_len = cidr.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit gives the last address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        chosen = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
4559
4560
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets an individual request pick its proxy via the Ytdl-request-proxy header"""

    def __init__(self, proxies=None):
        def _make_default_opener(scheme):
            # The scheme and the bound proxy_open are captured as default
            # arguments so that every opener keeps its own values
            def _open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            return _open

        # Set default handlers; the base class initializer may replace them
        # for the schemes it receives proxies for
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme, _make_default_opener(scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Resolve the proxy to use for req, giving priority to its Ytdl-request-proxy header"""
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4584
4585
4586 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4587 # released into Public Domain
4588 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4589
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    The result is big-endian without leading zero bytes. Zero and negative
    values are encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # int.to_bytes does the conversion in one linear pass
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # Left-pad up to the next multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4618
4619
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    byte string yields 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes replaces the manual zero-padding and 4-byte unpack loop
    return int.from_bytes(s, 'big')
4635
4636
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA scheme. See http://www.ohdave.com/rsa/

    Input:
        data: the bytes-like object to encrypt; it is read as a little-endian number
        exponent, modulus: the RSA parameters e and N, both integers
    Output: the encrypted number as a lowercase hex string

    Limitation: only one block can be encrypted
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return '%x' % ciphertext
4652
4653
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the
    padding consists of random non-zero bytes.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for at least
                               8 padding bytes
    """
    data = list(data)
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding must never contain a zero byte: the first zero after the
    # [0, 2] header marks the end of the padding when the block is decoded
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4667
4668
def encode_base_n(num, n, table=None):
    """Convert a non-negative integer to its base-n string representation

    @param num    The number to encode
    @param n      The base
    @param table  Digit characters to use, indexed by digit value. Defaults to
                  the first n characters of 0-9, a-z, A-Z
    @returns      The encoded string; zero is encoded as the first table entry
    @raises ValueError  if n exceeds the table length, if num is negative, or
                        if a non-zero num is requested in a base below 2
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Without these guards the conversion loop below would never terminate
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
4685
4686
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into its unpacked form

    Every word of the obfuscated payload is the base-n encoding of an index
    into the "|"-separated symbol list; an empty symbol stands for the encoded
    word itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    # Walk the indices from the highest one down to zero
    for index in range(count - 1, -1, -1):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    def _lookup(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _lookup, obfuscated_code)
4703
4704
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher to s

    Characters found in alphabet are moved shift positions forward, wrapping
    around at the end; all other characters are kept as they are.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4712
4713
def rot47(s):
    """Rotate every printable ASCII character of s ("!" to "~") by 47 positions"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4716
4717
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=VALUE attribute list into a dict

    Values may be quoted with double quotes, in which case they can contain
    commas; the surrounding quotes are removed from the result.
    """
    attributes = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        name, value = match.group('key', 'val')
        attributes[name] = value[1:-1] if value.startswith('"') else value
    return attributes
4725
4726
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as an unsigned 32-bit integer"""
    if val < 0:
        val += 0x100000000
    return val >> n
4729
4730
4731 # Based on png2str() written by @gdkchan and improved by @yokrysty
4732 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a non-interlaced PNG image with 8-bit RGB (truecolour) samples

    @param png_data  The complete PNG file as a bytes-like object
    @returns         A (width, height, pixels) tuple where pixels is a list of
                     rows and every row is a flat list of width * 3 integers
                     (the R, G and B samples of each pixel in order)
    @raises IOError  if the data is not a PNG file, uses a pixel format other
                     than non-interlaced 8-bit RGB, or holds too little image
                     data
    """
    # Reference: https://www.w3.org/TR/PNG/
    # The 8 byte signature is followed by the IHDR chunk, whose type field
    # sits behind the 4 byte chunk length
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    def unpack_integer(raw):
        return int.from_bytes(raw, 'big')

    # Walk the chunks by offset instead of re-slicing the remaining data
    ihdr = None
    idat_parts = []
    pos, end = 8, len(png_data)
    while pos < end:
        length = unpack_integer(png_data[pos:pos + 4])
        chunk_type = png_data[pos + 4:pos + 8]
        chunk_data = png_data[pos + 8:pos + 8 + length]
        pos += 12 + length  # length, type and CRC fields take 4 bytes each

        if ihdr is None:
            ihdr = chunk_data  # the first chunk was verified to be IHDR
        elif chunk_type == b'IDAT':
            idat_parts.append(chunk_data)

    if len(ihdr) < 13:
        raise IOError('Not a valid PNG file.')

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])
    bit_depth, color_type, interlace_method = ihdr[8], ihdr[9], ihdr[12]

    # The unfiltering below hard-codes 3 bytes per pixel and sequential rows
    if (bit_depth, color_type, interlace_method) != (8, 2, 0):
        raise IOError('Unsupported PNG format: only non-interlaced 8-bit RGB images can be decoded.')

    idat = b''.join(idat_parts)
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    # Every scanline is preceded by one filter type byte
    if len(decompressed_data) < height * (1 + stride):
        raise IOError('Unable to read PNG data.')

    pixels = []
    previous_row = None

    for y in range(height):
        base_pos = y * (1 + stride)
        filter_type = decompressed_data[base_pos]
        scanline = decompressed_data[base_pos + 1:base_pos + 1 + stride]

        current_row = []
        for x in range(stride):
            color = scanline[x]
            # Same sample of the pixel to the left and of the pixel above
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = previous_row[x - 3] if x > 2 and y > 0 else 0

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

        pixels.append(current_row)
        previous_row = current_row

    return width, height, pixels
4836
4837
def write_xattr(path, key, value):
    """Set the extended attribute key of the file at path to value

    The python xattr bindings are used when importable. Otherwise the value is
    written to an NTFS Alternate Data Stream on Windows, or handed to the
    "setfattr"/"xattr" command line tools elsewhere.

    Raises XAttrUnavailableError when no usable implementation is found and
    XAttrMetadataError when setting the attribute fails.
    """
    try:
        # Prefer a python binding when one can be imported
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        value = value.decode('utf-8')
        if user_has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', value]
        else:
            executable, opts = 'xattr', ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd.extend([encodeArgument(o) for o in opts])
        cmd.append(encodeFilename(path, True))

        try:
            p = Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        _, stderr = p.communicate_or_kill()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
4920
4921
def random_birthday(year_field, month_field, day_field):
    """Generate a random date of birth between 1950-01-01 and 1995-12-31 (inclusive)

    Returns a dict that maps the three given field names to the year, month
    and day of the date as strings without zero padding.
    """
    earliest = datetime.date(1950, 1, 1).toordinal()
    latest = datetime.date(1995, 12, 31).toordinal()
    birthday = datetime.date.fromordinal(random.randint(earliest, latest))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
4932
4933
# Templates for internet shortcut files, which are plain text files.
# Each template is filled in with the "%" operator and a mapping; the
# .lstrip() removes the newline that follows the opening quotes, so the
# resulting text starts directly with the first line of the template.

# INI-style "[InternetShortcut]" file; expects the key "url"
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# Apple property list (XML) with a single "URL" entry; expects the key "url"
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# "[Desktop Entry]" file of type "Link"; expects the keys "filename" and "url"
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Maps a link type name to the template used for it
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4965
4966
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Unicode hostnames are converted to Punycode. An explicit port is kept, except for the redundant port 80 of `http` URLs. IRIs without a host (e.g. relative references) only get their remaining components escaped.

    Raises a ValueError for IRIs with an IPv6 host, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location at all
    if iri_parts.hostname:
        net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # Port 80 is only the default of http; dropping it for any other scheme
    # (e.g. https) would make the URI point to a different port
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5009
5010
def to_high_limit_path(path):
    r"""Return path in a form that is not subject to the MAX_PATH limit on Windows

    On win32 and cygwin the absolute path is prefixed with "\\?\"; on all
    other platforms the path is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    long_path_prefix = r'\\?\ '.rstrip()
    return long_path_prefix + os.path.abspath(path)
5017
5018
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value looked up in obj with a %-style template

    @param obj       Object handed to traverse_obj
    @param field     Path (or paths) handed to traverse_obj
    @param template  %-style template the value is inserted into
    @param ignore    Values for which default is returned instead
    @param default   Result for ignored values
    @param func      Optional function applied to the value before formatting
    """
    value = traverse_obj(obj, *variadic(field))
    if value in ignore:
        return default
    if func:
        value = func(value)
    return template % value
5024
5025
def clean_podcast_url(url):
    """Remove the known podcast tracking/analytics redirect prefixes from url"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
5041
5042
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random version 4 UUID as a lowercase, hyphenated string

    In the template "x" is replaced with an arbitrary hex digit, while "y"
    (the variant field) is restricted to 8, 9, a or b as required for
    RFC 4122 UUIDs.
    """
    return re.sub(
        r'[xy]',
        lambda m: random.choice(_HEX_TABLE if m.group(0) == 'x' else '89ab'),
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5048
5049
def make_dir(path, to_screen=None):
    """Create the parent directory of path, including missing ancestors

    @param path       Path of a file whose directory should exist
    @param to_screen  Optional callable that receives an error message when
                      the directory cannot be created
    @returns          True if the directory exists afterwards (or path has no
                      directory part), False if creating it failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok avoids a failure when the directory gets created by
            # someone else between the check above and this call
            os.makedirs(dn, exist_ok=True)
        return True
    except (OSError, IOError) as err:
        # Only report the error when a reporter was actually supplied
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5060
5061
def get_executable_path():
    """Return the absolute path of the directory the program is run from

    This is the directory of the executable for frozen builds, and otherwise
    a directory above this module (two levels up when loaded from a ZIP).
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        location = os.path.dirname(sys.executable)
    else:
        module_dir = os.path.dirname(__file__)
        # Running from ZIP
        from_zip = isinstance(globals().get('__loader__'), zipimporter)
        location = os.path.join(module_dir, '../..' if from_zip else '..')
    return os.path.abspath(location)
5071
5072
def load_plugins(name, suffix, namespace):
    """Load plugin classes from the ytdlp_plugins/<name> package next to the program

    Every attribute of the package whose name ends with suffix and is not yet
    present in namespace is added to namespace.

    @returns  A dict of the newly added names and objects; empty when a
              FileNotFoundError occurs before anything was added
    """
    classes = {}
    try:
        init_file = os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')
        plugins_spec = importlib.util.spec_from_file_location(name, init_file)
        plugins = importlib.util.module_from_spec(plugins_spec)
        # Register the module before executing it
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for attr_name in dir(plugins):
            if attr_name in namespace or not attr_name.endswith(suffix):
                continue
            klass = getattr(plugins, attr_name)
            classes[attr_name] = namespace[attr_name] = klass
    except FileNotFoundError:
        pass
    return classes
5091
5092
5093 def traverse_obj(
5094         obj, *path_list, default=None, expected_type=None, get_all=True,
5095         casesense=True, is_user_input=False, traverse_string=False):
5096     ''' Traverse nested list/dict/tuple
5097     @param path_list        A list of paths which are checked one by one.
5098                             Each path is a list of keys where each key is a string,
5099                             a function, a tuple of strings/None or "...".
5100                             When a fuction is given, it takes the key as argument and
5101                             returns whether the key matches or not. When a tuple is given,
5102                             all the keys given in the tuple are traversed, and
5103                             "..." traverses all the keys in the object
5104                             "None" returns the object without traversal
5105     @param default          Default value to return
5106     @param expected_type    Only accept final value of this type (Can also be any callable)
5107     @param get_all          Return all the values obtained from a path or only the first one
5108     @param casesense        Whether to consider dictionary keys as case sensitive
5109     @param is_user_input    Whether the keys are generated from user input. If True,
5110                             strings are converted to int/slice if necessary
5111     @param traverse_string  Whether to traverse inside strings. If True, any
5112                             non-compatible object will also be converted into a string
5113     # TODO: Write tests
5114     '''
5115     if not casesense:
5116         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5117         path_list = (map(_lower, variadic(path)) for path in path_list)
5118
5119     def _traverse_obj(obj, path, _current_depth=0):
5120         nonlocal depth
5121         path = tuple(variadic(path))
5122         for i, key in enumerate(path):
5123             if None in (key, obj):
5124                 return obj
5125             if isinstance(key, (list, tuple)):
5126                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5127                 key = ...
5128             if key is ...:
5129                 obj = (obj.values() if isinstance(obj, dict)
5130                        else obj if isinstance(obj, (list, tuple, LazyList))
5131                        else str(obj) if traverse_string else [])
5132                 _current_depth += 1
5133                 depth = max(depth, _current_depth)
5134                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5135             elif callable(key):
5136                 if isinstance(obj, (list, tuple, LazyList)):
5137                     obj = enumerate(obj)
5138                 elif isinstance(obj, dict):
5139                     obj = obj.items()
5140                 else:
5141                     if not traverse_string:
5142                         return None
5143                     obj = str(obj)
5144                 _current_depth += 1
5145                 depth = max(depth, _current_depth)
5146                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5147             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5148                 obj = (obj.get(key) if casesense or (key in obj)
5149                        else next((v for k, v in obj.items() if _lower(k) == key), None))
5150             else:
5151                 if is_user_input:
5152                     key = (int_or_none(key) if ':' not in key
5153                            else slice(*map(int_or_none, key.split(':'))))
5154                     if key == slice(None):
5155                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5156                 if not isinstance(key, (int, slice)):
5157                     return None
5158                 if not isinstance(obj, (list, tuple, LazyList)):
5159                     if not traverse_string:
5160                         return None
5161                     obj = str(obj)
5162                 try:
5163                     obj = obj[key]
5164                 except IndexError:
5165                     return None
5166         return obj
5167
5168     if isinstance(expected_type, type):
5169         type_test = lambda val: val if isinstance(val, expected_type) else None
5170     elif expected_type is not None:
5171         type_test = expected_type
5172     else:
5173         type_test = lambda val: val
5174
5175     for path in path_list:
5176         depth = 0
5177         val = _traverse_obj(obj, path)
5178         if val is not None:
5179             if depth:
5180                 for _ in range(depth - 1):
5181                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5182                 val = [v for v in map(type_test, val) if v is not None]
5183                 if val:
5184                     return val if get_all else val[0]
5185             else:
5186                 val = type_test(val)
5187                 if val is not None:
5188                     return val
5189     return default
5190
5191
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated front-end for traverse_obj.

    Emits a deprecation notice through write_string on every call and then
    forwards to traverse_obj with is_user_input and traverse_string enabled.

    @param dictn        The object to traverse
    @param keys         The path, passed to traverse_obj as its only path
    @param casesense    Forwarded unchanged to traverse_obj
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)

    forwarded_options = {
        'casesense': casesense,
        'is_user_input': True,
        'traverse_string': True,
    }
    return traverse_obj(dictn, keys, **forwarded_options)
5196
5197
def variadic(x, allowed_types=(str, bytes, dict)):
    """Make sure the result can be iterated as a collection of items.

    @param x              Any object
    @param allowed_types  Iterable types that must still be treated as a
                          single item (by default str, bytes and dict)
    @returns              x itself when it is an iterable that is not one of
                          allowed_types, otherwise the 1-tuple (x,)
    """
    if not isinstance(x, collections.abc.Iterable):
        return (x,)
    if isinstance(x, allowed_types):
        return (x,)
    return x
5200
5201
def decode_base(value, digits):
    """Convert a string written in base len(digits) into an integer.

    @param value    Sequence of symbols, most significant first
    @param digits   The alphabet; the position of a symbol is its digit value
    @returns        The decoded integer (0 for an empty value)
    A symbol that is not part of digits raises KeyError.
    """
    radix = len(digits)
    digit_values = {symbol: position for position, symbol in enumerate(digits)}
    total = 0
    for symbol in value:
        total = total * radix + digit_values[symbol]
    return total
5211
5212
def time_seconds(**kwargs):
    """Return the current time as a POSIX timestamp (float seconds).

    @param kwargs   Passed to datetime.timedelta to build the fixed-offset
                    timezone in which "now" is taken

    NOTE(review): timestamp() of a timezone-aware datetime does not depend on
    its UTC offset, so the offset does not appear to shift the returned value.
    Confirm whether callers expect an offset-adjusted result.
    """
    utc_offset = datetime.timedelta(**kwargs)
    tz = datetime.timezone(utc_offset)
    return datetime.datetime.now(tz).timestamp()
5216
5217
5218 # create a JSON Web Signature (jws) with HS256 algorithm
5219 # the resulting format is in JWS Compact Serialization
5220 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5221 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build an HS256-signed token of the form header.payload.signature.

    @param payload_data JSON-serializable claims
    @param key          Secret as str; it is UTF-8 encoded before signing
    @param headers      Extra header fields merged over the default
                        {'alg': 'HS256', 'typ': 'JWT'}
    @returns            The token as bytes

    NOTE(review): segments are produced with base64.b64encode (standard
    alphabet, padded), whereas RFC 7515 specifies unpadded base64url.
    Kept as-is to preserve the existing output.
    """
    def _encode_segment(data):
        return base64.b64encode(json.dumps(data).encode('utf-8'))

    header_data = dict(alg='HS256', typ='JWT')
    if headers:
        header_data.update(headers)

    header_b64 = _encode_segment(header_data)
    payload_b64 = _encode_segment(payload_data)
    signing_input = b'.'.join((header_b64, payload_b64))
    digest = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256).digest()
    return b'.'.join((signing_input, base64.b64encode(digest)))
5235
5236
5237 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the JSON payload of a compact-serialized JWT.

    The header and signature segments are ignored; the signature is NOT
    verified.

    @param jwt  Token as str in the form 'header.payload.signature'
    @returns    The decoded payload (whatever json.loads yields)
    Raises ValueError when the token does not consist of exactly three
    dot-separated segments.
    """
    _, payload_b64, _ = jwt.split('.')
    # JWS segments are base64url with the trailing '=' stripped (RFC 7515),
    # but the decoder insists on padding, so restore whatever is missing.
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5242
5243
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences may be written to stream.

    On 'nt' this requires WINDOWS_VT_MODE to be truthy and a Windows version
    of at least (10, 0, 10586); elsewhere it requires a non-empty TERM
    environment variable. When that holds, the answer is stream.isatty();
    any exception raised by that call is treated as "not supported".
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE:
            return False
        if get_windows_version() < (10, 0, 10586):
            return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
5255
5256
# ESC, "[", one or more characters other than "m", then the closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return string with every match of _terminal_sequences_re deleted."""
    return re.sub(_terminal_sequences_re, '', string)
5262
5263
def number_of_digits(number):
    """Return the length of number rendered with '%d'.

    A leading minus sign is part of the rendering and is therefore counted.
    """
    rendered = '%d' % number
    return len(rendered)
5266
5267
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy values, converted with str(), using delim.

    @param values       The items to join; falsy ones (None, '', 0, ...) are dropped
    @param delim        Separator placed between the kept items
    @param from_dict    When given, values are treated as keys and replaced
                        by from_dict.get(key) before filtering
    """
    if from_dict is not None:
        values = (from_dict.get(key) for key in values)
    return delim.join(str(item) for item in values if item)
5272
5273
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Point every thumbnail at the size of the largest format.

    The largest (width, height) pair among formats is determined (missing or
    falsy values count as 0; pairs are compared width first). If that width is
    0, thumbnails is returned untouched. Otherwise, for each thumbnail:
    * every match of url_width_re in its URL is replaced with that width
    * the result is built with merge_dicts from the new URL, the largest
      dimensions and the original thumbnail, in that order

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    largest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumb in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumb['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            dict(zip(dimension_keys, largest)), thumb))
    return scaled
5294
5295
def parse_http_range(range):
    """Parse the value of a "Range" or "Content-Range" HTTP header.

    @param range    Header value, e.g. 'bytes=0-99' or 'bytes 0-99/1000'
    @returns        A 3-tuple taken from the start, end and total groups of
                    the pattern; the start is an int, the other two go through
                    int_or_none. (None, None, None) is returned for an empty
                    value or when the pattern is not found.
    """
    no_range = (None, None, None)
    found = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not found:
        return no_range
    start, end, total = found.groups()
    return int(start), int_or_none(end), int_or_none(total)
5304
5305
class Config:
    """A set of command-line style arguments plus the configs they pull in.

    Each instance holds its own argument list (own_args), optionally the file
    it was read from (filename), and child Config objects (configs) created
    for every config location named by those arguments. All instances created
    through append_config share one set of already-loaded paths so that the
    same file is not loaded twice.
    """
    own_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser = parser
        self.label = label
        self._loaded_paths = set()
        self.configs = []

    def init(self, args=None, filename=None):
        """Load args (read from filename, if given) and any configs they reference.

        Returns False without loading anything when filename resolves to a
        path that was already loaded; returns True otherwise. May only be
        called once per instance.
        """
        assert not self.__initialized
        base_dir = ''
        if filename:
            real_path = os.path.realpath(filename)
            base_dir = os.path.dirname(real_path)
            if real_path in self._loaded_paths:
                return False
            self._loaded_paths.add(real_path)

        self.__initialized = True
        self.own_args = args
        self.filename = filename

        parsed_opts = self._parser.parse_args(args)[0]
        for entry in parsed_opts.config_locations or []:
            # Relative locations are resolved against the directory of the current file
            conf_path = os.path.join(base_dir, expand_path(entry))
            if os.path.isdir(conf_path):
                conf_path = os.path.join(conf_path, 'yt-dlp.conf')
            if not os.path.exists(conf_path):
                self._parser.error(f'config location {conf_path} does not exist')
            self.append_config(self.read_file(conf_path), conf_path)
        return True

    def __str__(self):
        quoted_name = f'"{self.filename}"' if self.filename else ''
        label = join_nonempty(self.label, 'config', quoted_name, delim=' ')

        # False is dropped by join_nonempty, so no line is shown without own_args
        own_line = False
        if self.own_args is not None:
            own_line = f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}'

        # Child configs are rendered with every line prefixed by "| "
        nested = (f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs)
        return join_nonempty(own_line, *nested, delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Return the shell-split contents of filename, or default if it cannot be opened."""
        try:
            optionf = open(filename)
        except IOError:
            return default  # silently skip if file is not present

        with optionf:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            if sys.version_info < (3,):
                contents = contents.decode(preferredencoding())
            return compat_shlex_split(contents, comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of opts with the values of credential options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        alternatives = '|'.join(re.escape(po) for po in PRIVATE_OPTS)
        eqre = re.compile('^(?P<key>' + alternatives + ')=.+$')

        # First pass: the "--option=value" form
        scrubbed = []
        for opt in opts:
            m = eqre.match(opt)
            scrubbed.append(m.group('key') + '=PRIVATE' if m else opt)

        # Second pass: the "--option value" form; the list is edited while it
        # is being walked, so an already replaced value is not treated as an option
        for idx, opt in enumerate(scrubbed):
            if opt in PRIVATE_OPTS and idx + 1 < len(scrubbed):
                scrubbed[idx + 1] = 'PRIVATE'
        return scrubbed

    def append_config(self, *args, label=None):
        """Create a child config from args (as accepted by init) and keep it if it loaded."""
        child = type(self)(self._parser, label)
        child._loaded_paths = self._loaded_paths
        if child.init(*args):
            self.configs.append(child)

    @property
    def all_args(self):
        """Yield the arguments of the child configs (last added first), then own_args."""
        for child in reversed(self.configs):
            for arg in child.all_args:
                yield arg
        for arg in self.own_args or []:
            yield arg

    def parse_args(self):
        """Parse the combined arguments of this config and all its children."""
        return self._parser.parse_args([*self.all_args])
5393
5394
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes

    A private event loop is created per instance and every coroutine of the
    wrapped connection is driven to completion on it. Use as a context
    manager; __exit__ is additionally registered with atexit so that the
    connection is shut down at interpreter exit if it was left open.
    """

    def __init__(self, url, headers=None):
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Run the connection's send(*args) coroutine to completion."""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Run the connection's recv(*args) coroutine to completion and return its result."""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # The atexit hook calls this again after a normal exit; nothing can
        # (or needs to) be run on a loop that is already closed
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Leftover tasks must be cancelled while the loop can still run
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine main on loop and return its result.

        Raises ValueError when main is not a coroutine object. The loop's
        async generators and (where supported) default executor are shut
        down after every call.
        """
        if not asyncio.coroutines.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every task of loop, wait for them and report unexpected exceptions."""
        to_cancel = asyncio.tasks.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() no longer accepts "loop" (removed in Python 3.10); the loop
        # is taken from the tasks themselves, which all belong to `loop`
        loop.run_until_complete(
            asyncio.tasks.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5458
5459
# Truthiness of compat_websockets as a plain bool
# (presumably falsy when the optional websockets package is unavailable - confirm in .compat)
has_websockets = True if compat_websockets else False
5461
5462
def merge_headers(*dicts):
    """Merge dicts of network headers case insensitively, prioritizing the latter ones

    Header names are normalized with str.capitalize(), so names differing
    only in case end up under one key holding the value given last.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.capitalize()] = value
    return merged