]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[cleanup] Misc cleanup
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
15dfb392 1import atexit
1e399778 2import base64
5bc880b9 3import binascii
912b38b4 4import calendar
676eb3f2 5import codecs
c380cc28 6import collections
62e609ab 7import contextlib
e3946f98 8import ctypes
c496ca96 9import datetime
0c265486 10import email.header
f8271158 11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
49fa4d9a
N
14import hashlib
15import hmac
ac668111 16import html.entities
17import html.parser
54007a45 18import http.client
19import http.cookiejar
019a94f7 20import importlib.util
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
14f25df2 43import urllib.error
f8271158 44import urllib.parse
ac668111 45import urllib.request
bcf89ce6 46import xml.etree.ElementTree
d77c3dfd 47import zlib
d77c3dfd 48
c487cf00 49from .compat import asyncio, functools # isort: split
8c25f81b 50from .compat import (
36e6f62c 51 compat_etree_fromstring,
51098426 52 compat_expanduser,
f8271158 53 compat_HTMLParseError,
efa97bdc 54 compat_os_name,
702ccf2d 55 compat_shlex_quote,
8c25f81b 56)
ac668111 57from .dependencies import brotli, certifi, websockets, xattr
f8271158 58from .socks import ProxyType, sockssocket
71aff188 59
4644ac55 60
51fb4995
YCH
def register_socks_protocols():
    """Make urllib treat the SOCKS schemes as netloc-carrying protocols.

    urlsplit() only parses the netloc portion for schemes listed in
    urllib.parse.uses_netloc (see https://bugs.python.org/issue7904),
    so the SOCKS schemes must be appended there.
    """
    known_schemes = urllib.parse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
51fb4995
YCH
68
69
468e2e92
FV
# This is not clearly defined otherwise
# (re.Pattern is only importable on newer Pythons, so derive the type)
compiled_regex_type = type(re.compile(''))
f7a147e3
S
73
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent with a randomly picked Chrome version."""
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    # Real Chrome release versions; one is substituted into the template
    chrome_versions = (
        '90.0.4430.212', '90.0.4430.24', '90.0.4430.70', '90.0.4430.72',
        '90.0.4430.85', '90.0.4430.93', '91.0.4472.101', '91.0.4472.106',
        '91.0.4472.114', '91.0.4472.124', '91.0.4472.164', '91.0.4472.19',
        '91.0.4472.77', '92.0.4515.107', '92.0.4515.115', '92.0.4515.131',
        '92.0.4515.159', '92.0.4515.43', '93.0.4556.0', '93.0.4577.15',
        '93.0.4577.63', '93.0.4577.82', '94.0.4606.41', '94.0.4606.54',
        '94.0.4606.61', '94.0.4606.71', '94.0.4606.81', '94.0.4606.85',
        '95.0.4638.17', '95.0.4638.50', '95.0.4638.54', '95.0.4638.69',
        '95.0.4638.74', '96.0.4664.18', '96.0.4664.45', '96.0.4664.55',
        '96.0.4664.93', '97.0.4692.20',
    )
    return template % random.choice(chrome_versions)
117
118
# Content-Encodings yt-dlp can decode; 'br' only when the brotli package is present
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP headers sent with every request
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
f427df17 131
5f6a1245 132
fb37eb25
S
# Alternative User-Agent strings for extractors that need a specific browser
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel distinguishing "no default supplied" from an explicit None default
NO_DEFAULT = object()
# Identity function, used as a default transform callback
IDENTITY = lambda x: x
bf42a990 140
7105440c
YCH
# Month names used when parsing textual dates
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Per-language month names (index 0 == January)
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
a942d6cb 151
a7aaa398
S
# Media file extensions recognized when guessing formats from URLs/filenames
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# (maps each accented character to an ASCII replacement, in zip order)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 171
46f59e89
S
# strptime() formats tried when parsing dates whose day/month order is unambiguous
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Additional formats for locales that write the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Additional formats for locales that write the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of P.A.C.K.E.R.-style packed JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Extracts the JSON-LD payload from a <script type="application/ld+json"> tag
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

# Matches an unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
240
7105440c 241
@functools.cache
def preferredencoding():
    """Return the system's preferred encoding, falling back to 'UTF-8'.

    The value reported by locale.getpreferredencoding() is only trusted
    if it can actually encode text; otherwise 'UTF-8' is returned.
    """
    try:
        encoding = locale.getpreferredencoding()
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 256
f4bfd65f 257
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a sibling temp file first, then rename over the target,
    # so readers never observe a partially-written file
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates the file as 0o600; restore the
            # default permissions (0o666 masked by the current umask)
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file before re-raising
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
282
283
def find_xpath_attr(node, xpath, key, val=None):
    """Find the first node matching xpath[@key] (or xpath[@key='val'])."""
    # Only plain attribute names are supported; anything else would break the expression
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 289
d7e66d39
JMF
290# On python2.6 the xml.etree.ElementTree.Element methods don't support
291# the namespace parameter
5f6a1245
JW
292
293
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' using ns_map."""
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(step) for step in path.split('/'))
304
d77c3dfd 305
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string, or an iterable of fallbacks).

    Returns `default` if given and nothing matched; raises ExtractorError
    when fatal=True; otherwise returns None.
    """
    candidates = (xpath,) if isinstance(xpath, str) else xpath
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        if fatal:
            raise ExtractorError('Could not find XML element %s' % (xpath if name is None else name))
        return None
    return n
327
328
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element at xpath, honouring default/fatal like xpath_element."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element exists but has no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
a41fb80c
S
342
343
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element at xpath, honouring default/fatal."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
bf0ff932
PH
355
356
def get_element_by_id(id, html, **kwargs):
    """Content of the first tag carrying the given id, or None."""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Whole HTML of the first tag carrying the given id, or None."""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Content of the first tag carrying the given class, or None."""
    all_matches = get_elements_by_class(class_name, html)
    return all_matches[0] if all_matches else None


def get_element_html_by_class(class_name, html):
    """Whole HTML of the first tag carrying the given class, or None."""
    all_matches = get_elements_html_by_class(class_name, html)
    return all_matches[0] if all_matches else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    """Content of the first tag with the given attribute=value, or None."""
    all_matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    return all_matches[0] if all_matches else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Whole HTML of the first tag with the given attribute=value, or None."""
    all_matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return all_matches[0] if all_matches else None
387
388
def get_elements_by_class(class_name, html, **kargs):
    """Contents of all tags carrying the given class, as a list."""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Whole HTML of all tags carrying the given class, as a list."""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Contents of all tags with the given attribute, as a list."""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Whole HTML of all tags with the given attribute, as a list."""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
411
412
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    # If the value starts with a character that must be quoted in HTML,
    # the quotes are mandatory; otherwise they are optional ('?')
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag up to and including attribute=value;
    # (?-x: ...) re-enables whitespace significance inside the value
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip a matching pair of surrounding quotes from the content, then unescape
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 436
c5229f39 437
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Raised (deliberately) to abort parsing once the matching close tag is seen
        pass

    def __init__(self):
        # Stack of currently-open tag names
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first opened tag has now been closed: signal the caller
            raise self.HTMLBreakOnClosingTagException()
478
479
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises the given parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Prime the parser with the opening tag only
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed successive candidate closing tags until the parser signals
        # that the original opening tag has been closed
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
513
514
class HTMLAttributeParser(html.parser.HTMLParser):
    """Collect the attributes of a single HTML element fed to the parser."""

    def __init__(self):
        # Attributes of the most recently seen start tag
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Callers feed exactly one element, so simply keep the last seen tag's attributes
        self.attrs = dict(attrs)
524
c5229f39 525
class HTMLListAttrsParser(html.parser.HTMLParser):
    """Collect the attributes of each top-level <li> element fed to the parser."""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        # Nesting depth relative to the stream fed in; only depth-0 <li>s are recorded
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
541
542
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # Malformed markup yields whatever was parsed before the error
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
9e6dd238 562
c5229f39 563
73673ccf
FF
def parse_list(webpage):
    """Parse a string of consecutive HTML <li> elements into a list of
    attribute dictionaries, one per top-level <li>."""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
571
572
def clean_html(html):
    """Collapse an HTML snippet into readable plain text (None passes through)."""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    substitutions = (
        (r'\s+', ' '),                                # collapse runs of whitespace
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),         # <br> variants -> newline
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),   # paragraph boundary -> newline
        ('<.*?>', ''),                                # strip remaining tags
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities and trim surrounding whitespace
    return unescapeHTML(html).strip()
9e6dd238
FV
587
588
class LenientJSONDecoder(json.JSONDecoder):
    """JSONDecoder that can pre-process its input and tolerate trailing garbage.

    transform_source: optional callable applied to the string before decoding
    ignore_extra:     decode only the first JSON value, ignoring trailing data
    """

    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source = transform_source
        self.ignore_extra = ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            # raw_decode returns (value, end_index); discard the index
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
600
601
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            # Put stdout into binary mode so written data is not mangled
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # Second iteration (if reached) retries with a sanitized path
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported: fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Access errors will not be fixed by sanitizing the name; re-raise at once
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
d77c3dfd
FV
636
637
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a unix timestamp (None if unparsable)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 645
5f6a1245 646
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # '\0' marks substitute characters so repeated/leading/trailing
        # substitutions can be collapsed and stripped below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 694
5f6a1245 695
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # On other platforms the path is returned untouched unless force=True
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters that are invalid in Windows path components
    # (and trailing spaces/dots) with '#'; keep '.'/'..' untouched
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve an absolute path's leading separator
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
717
718
def sanitize_url(url):
    """Repair protocol-less URLs and a few common scheme typos seen in the wild."""
    if url is None:
        return
    if url.startswith('//'):
        # Scheme-relative URL: assume plain http to mitigate failures
        # caused by a missing protocol
        return 'http:%s' % url
    typo_fixes = (
        (r'^httpss://', r'https://'),        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^rmtp([es]?)://', r'rtmp\1://'),  # e.g. https://bx1.be/lives/direct-tv/
    )
    for mistake, fixup in typo_fixes:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
17bcc626
S
737
738
def extract_basic_auth(url):
    """Strip userinfo from url; return (clean_url, basic_auth_header_or_None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return clean_url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
749
750
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request from a sanitized/escaped URL, moving any
    # user:password@ credentials into a Basic Authorization header
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # args[1] is the `data` positional of Request; headers is args index 1? No —
        # NOTE(review): args[1] here is treated as the `headers` positional — confirm
        # against urllib.request.Request(url, data, headers, ...) call sites
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
67dda517
S
757
758
51098426
S
def expand_path(s):
    """Expand shell variables and ~"""
    # compat_expanduser is the project's compat shim — presumably a fixed
    # os.path.expanduser; see .compat for details
    return os.path.expandvars(compat_expanduser(s))
762
763
def orderedSet(iterable, *, lazy=False):
    """Return the input's items with duplicates removed, preserving order.

    With lazy=True a generator is returned instead of a list.
    """
    def deduped():
        seen = []  # a list, not a set, since items may be unhashable
        for item in iterable:
            if item in seen:
                continue
            seen.append(item)
            yield item

    if lazy:
        return deduped()
    return list(deduped())
d77c3dfd 774
912b38b4 775
55b2f099 776def _htmlentity_transform(entity_with_semicolon):
4e408e47 777 """Transforms an HTML entity to a character."""
55b2f099
YCH
778 entity = entity_with_semicolon[:-1]
779
4e408e47 780 # Known non-numeric HTML entity
ac668111 781 if entity in html.entities.name2codepoint:
782 return chr(html.entities.name2codepoint[entity])
4e408e47 783
55b2f099
YCH
784 # TODO: HTML5 allows entities without a semicolon. For example,
785 # '&Eacuteric' should be decoded as 'Éric'.
ac668111 786 if entity_with_semicolon in html.entities.html5:
787 return html.entities.html5[entity_with_semicolon]
55b2f099 788
91757b0f 789 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
790 if mobj is not None:
791 numstr = mobj.group(1)
28e614de 792 if numstr.startswith('x'):
4e408e47 793 base = 16
28e614de 794 numstr = '0%s' % numstr
4e408e47
PH
795 else:
796 base = 10
067aa17e 797 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 798 with contextlib.suppress(ValueError):
ac668111 799 return chr(int(numstr, base))
4e408e47
PH
800
801 # Unknown entity in name, return its literal representation
7a3f0c00 802 return '&%s;' % entity
4e408e47
PH
803
804
def unescapeHTML(s):
    """Replace HTML entities in s with their characters (None passes through)."""
    if s is None:
        return None
    assert isinstance(s, str)

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
d77c3dfd 812
8bf48f23 813
def escapeHTML(text):
    """Escape &, <, >, " and ' for safe embedding in HTML."""
    # '&' must be replaced first so already-produced entities are not re-escaped
    for char, entity in (
            ('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
            ('"', '&quot;'), ("'", '&#39;')):
        text = text.replace(char, entity)
    return text
823
824
def process_communicate_or_kill(p, *args, **kwargs):
    # Deprecated shim kept for backwards compatibility; delegates to Popen
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 829
830
class Popen(subprocess.Popen):
    """subprocess.Popen with a hidden console window on Windows and safer helpers."""
    if sys.platform == 'win32':
        # Prevent a console window from flashing up for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # Like communicate(), but kills the process if interrupted
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        # timeout=0 (default): fire-and-forget; otherwise wait for termination
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        # Convenience wrapper returning (stdout, stderr, returncode)
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
862
d3c93ec2 863
aa49acd1
S
def get_subprocess_encoding():
    """Encoding used when exchanging data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    # Elsewhere the filesystem encoding applies, defaulting to utf-8
    return sys.getfilesystemencoding() or 'utf-8'
874
875
def encodeFilename(s, for_subprocess=False):
    """Legacy no-op: filenames are handled as str throughout nowadays."""
    assert isinstance(s, str)
    return s
aa49acd1
S
879
880
def decodeFilename(b, for_subprocess=False):
    """Legacy no-op counterpart of encodeFilename."""
    return b
8bf48f23 883
f07b74fc
PH
884
def encodeArgument(s):
    """Return s as str, decoding legacy byte-string arguments as ASCII."""
    if isinstance(s, str):
        return s
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s.decode('ascii')
f07b74fc
PH
890
891
def decodeArgument(b):
    """Legacy no-op: arguments are already str."""
    return b
aa49acd1
S
894
895
8271226a
PH
def decodeOption(optval):
    """Return the option value as str, decoding bytes with the preferred encoding."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, str)
    return optval
1c256f70 904
5f6a1245 905
# Named result type for timetuple_from_msec
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
914
915
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as H:MM:SS, M:SS or plain S (delim configurable);
    with msec=True a '.mmm' millisecond suffix is appended."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if not msec:
        return formatted
    return '%s.%03d' % (formatted, t.milliseconds)
4539dd30 925
a0ddb8a2 926
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        # Only X.509 ASN.1 certs trusted for server authentication
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        # An individual bad cert must not prevent loading the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
a2366922 938
77562778 939
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSLContext configured from params.

    Honours the 'nocheckcertificate', 'legacyserverconnect', 'compat_opts'
    ('no-certifi') and client-certificate options.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available, unless disabled via compat opts
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 981
732ea2f0 982
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" notice.

    `before` is the text the notice will be appended to; its trailing
    punctuation decides whether the notice starts capitalized.
    """
    from .update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    # Capitalize when starting a fresh sentence
    if not before or before[-1] in '.!?':
        msg = msg[0].title() + msg[1:]

    return f'{before} {msg}' if before else msg
08f2a92c
JMF
994
995
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None  # subclasses may set a default message here

    def __init__(self, msg=None):
        # Fall back to the class name when neither an explicit message
        # nor a class-level default is available
        if msg is None and self.msg is None:
            msg = type(self).__name__
        if msg is not None:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
1006
1007
# Exception classes that indicate a network failure rather than a yt-dlp bug
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1012
1013
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Network failures are never considered bugs in yt-dlp
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # Unwrap nested ExtractorErrors so exc_info points at the innermost cause
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        # Combined traceback of this error and its cause; None when neither exists
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1046
1c256f70 1047
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        self.url = url
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
1053
1054
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1058
1059
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Always an "expected" error — geo blocks are not yt-dlp bugs
        kwargs['expected'] = True
        self.countries = countries
        super().__init__(msg, **kwargs)
1071
1072
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects when downloading fails and they are
    not configured to continue on errors. Carries the appropriate message."""

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
1085
1086
class EntryNotInPlaylist(YoutubeDLError):
    """Thrown by YoutubeDL when a requested entry is not found in the
    playlist info_dict."""
    msg = 'Entry not found in info'
498f5606 1094
1095
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Include the conflicting filename in the message. Previously the
            # f-string had no placeholder (f': (unknown)'), silently discarding
            # the argument; mirrors UnavailableVideoError's f': {err}' pattern.
            self.msg += f': {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1108
1109
class PostProcessingError(YoutubeDLError):
    """Raised by a PostProcessor's .run() method to indicate an error in the
    postprocessing task."""
5f6a1245 1116
5f6a1245 1117
class DownloadCancelled(YoutubeDLError):
    """Base for exceptions raised when the download queue should be interrupted"""
    msg = 'The download was cancelled'
8b0d7497 1121
8b0d7497 1122
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered"""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1126
48f79687 1127
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-on-reject is triggered"""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1131
1132
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached"""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1136
1137
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1144
1145
class ThrottledDownload(ReExtractInfo):
    """Raised when download speed falls below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Throttling warrants a re-extraction attempt, so expected stays False
        super().__init__(self.msg, expected=False)
f2ebc5c7 1152
d77c3dfd 1153
class UnavailableVideoError(YoutubeDLError):
    """Thrown when a video is requested in a format that is not available
    for that video."""
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1166
1167
class ContentTooShortError(YoutubeDLError):
    """Raised by FileDownloader objects when a downloaded file is smaller
    than the size announced by the server, indicating the connection was
    probably interrupted."""

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
d77c3dfd 1181
5f6a1245 1182
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended attributes fails; `reason` gives a coarse
    classification (NO_SPACE, VALUE_TOO_LONG or NOT_SUPPORTED)."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify the failure from the errno and/or the message text
        out_of_space = 'No space left' in self.msg or 'Disk quota exceeded' in self.msg
        if self.code in (errno.ENOSPC, errno.EDQUOT) or out_of_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1197
1198
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended-attribute support is unavailable on this system."""
1201
1202
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Build an HTTP(S) connection, optionally binding it to the user-configured
    # source address ('source_address' in the handler's params).
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Choose the address family matching the source address (IPv4 vs IPv6)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            # All candidate addresses failed (or there were none)
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1248
1249
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, it and any 'Accept-Encoding' header
    (case-insensitive) are removed from a copy of the mapping; otherwise
    the original mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del filtered['Youtubedl-no-compression']
    return filtered
87f0e62d
YCH
1258
1259
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Open the connection, routing through a SOCKS proxy when the
        # internal 'Ytdl-socks-proxy' marker header is present
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Decompress raw-deflate, falling back to zlib-wrapped deflate
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # Decompress Brotli-encoded payloads (requires the brotli dependency)
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Advertise the compression schemes we can actually decode
        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate/brotli responses and
        # percent-encode any redirect Location header
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1388
5de90176 1389
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* that tunnels through the
    SOCKS proxy described by the URL *socks_proxy* (e.g. 'socks5://host:port').
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): an unrecognized scheme leaves socks_type unbound and will
    # raise NameError below — presumably callers pre-validate the scheme; confirm

    def unquote_if_non_empty(s):
        # Percent-decode credentials, leaving None/'' untouched
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Establish the proxied socket, then TLS-wrap it for HTTPS
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1431
1432
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class, SOCKS proxies and
    a friendlier error for legacy-TLS handshake failures."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Route through a SOCKS proxy when the internal marker header is set
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            # Translate a legacy-renegotiation handshake failure into a hint
            # to retry with --legacy-server-connect
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
be4a824d
PH
1461
1462
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a Netscape cookie file entry
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        # Accept path-like objects in addition to plain strings
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        # Netscape cookie files spell booleans as TRUE/FALSE
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Yield a text file object for *file*, which may be a path or an
        already-open file object (truncated first when writing)."""
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        # Write each (non-discarded, non-expired) cookie as a Netscape entry
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the #HttpOnly_ prefix and validate the entry format,
            # raising LoadError for malformed lines
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See  '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1595
1596
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that applies cookie handling to HTTPS traffic too."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # Mirror the HTTP hooks for HTTPS requests/responses
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1606
1607
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Treat 301/303/307/308 exactly like 302
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect, or raise HTTPError
        when this combination of status code and method must not redirect."""
        m = req.get_method()
        redirectable = (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                        or code in (301, 302, 303) and m == "POST")
        if not redirectable:
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to a POST
        # MUST NOT redirect without user confirmation; in practice essentially
        # all clients do, so we follow suit.

        # Be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')

        # The request body must not be carried across the redirect
        remove_headers = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
fca6dba8
S
1668
1669
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (offset, remainder): `offset` is a datetime.timedelta (zero when
    no usable designator is present) and `remainder` is the string with the
    designator removed.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))    # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
            ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A bare 'Z' designator means UTC
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return offset, date_str
1694
1695
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not supported by strptime's %S — drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        # Unparseable dates yield None rather than raising
        return None
    return calendar.timegm(dt.timetuple())
912b38b4
PH
1711
1712
46f59e89
S
def date_formats(day_first=True):
    """Return the date format strings, in day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1715
1716
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    # Drop any explicit UTC offset; only the date part matters here
    _, date_str = extract_timezone(date_str)

    # Try every known format; the LAST matching format wins since the loop
    # keeps overwriting upload_date without breaking
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 parsing (e.g. 'Thu 01 Jan 2014')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    # Implicitly returns None when nothing parsed
    if upload_date is not None:
        return str(upload_date)
bf50b038 1739
5f6a1245 1740
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date/time string to a UNIX timestamp (or None)."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # Remember a PM marker before it is stripped below; applied as +12h
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # First matching format wins (unlike unified_strdate, this returns early)
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    # Fall back to RFC 2822 parsing; implicitly returns None on failure
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1770
1771
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, ignoring any query string."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
73e79f2a 1783
5f6a1245 1784
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the output filename for a subtitle track (lang + format suffix)."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
d4051a8e 1787
5f6a1245 1788
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recurse on the base part so chained offsets compose
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months/years vary in length, so use calendar arithmetic
            # and round at day granularity afterwards
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1829
1830
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
    if strict and not re.fullmatch(allowed, date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1841
1842
def datetime_add_months(dt, months):
    """Increment/decrement a datetime/date by a number of months, clamping
    the day to the target month's length."""
    zero_based_month = dt.month + months - 1
    year = dt.year + zero_based_month // 12
    month = zero_based_month % 12 + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1850
1851
def datetime_round(dt, precision='day'):
    """Round a datetime's time to the nearest unit of *precision*."""
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1868
1869
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD'; other strings
    are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
1878
5f6a1245 1879
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96
PH
1909
1910
def platform_name():
    """ Returns the platform name as a str """
    deprecation_msg = 'DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead'
    write_string(deprecation_msg)
    return platform.platform()
c496ca96 1915
b1f94422 1916
@functools.cache
def system_identifier():
    """Return a one-line description of the running Python and platform."""
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        # Include PyPy's own version next to the implementation name
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    return 'Python %s (%s %s) - %s %s' % (
        platform.python_version(),
        python_implementation,
        platform.architecture()[0],
        platform.platform(),
        # libc info is optional; format_field yields an empty field when absent
        format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
    )
c257baff
PH
1930
1931
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1939
1940
def write_string(s, out=None, encoding=None):
    """Write str *s* to *out* (default: sys.stderr), encoding when the
    target stream requires bytes, then flush."""
    assert isinstance(s, str)
    out = out or sys.stderr

    # NOTE(review): nt-only workaround - prefixes line breaks with a space;
    # presumably avoids console rendering issues, confirm against
    # supports_terminal_sequences
    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: encode ourselves
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream: write encoded bytes to its underlying buffer
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1957
1958
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes-like (or str) sequence into a list of ints."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # bytes/bytearray already yield ints
        return list(bs)
    return [ord(c) for c in bs]
1966
c257baff 1967
def intlist_to_bytes(xs):
    """Convert a list of ints (0-255) into a bytes object.

    Returns b'' for falsy input (None or empty list).
    """
    if not xs:
        return b''
    # bytes() accepts any iterable of ints directly; no need for struct.pack
    return bytes(xs)
c38b1e77
PH
1972
1973
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform."""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1979
1980
c1c9a79c
PH
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Matches the Win32 OVERLAPPED struct passed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte range 0 .. (whole_high << 32 | whole_low)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can reuse the same struct
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking lock contention: propagate as-is
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        # No fcntl (and not win32): locking is simply unavailable

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
c1c9a79c
PH
2065
2066
class locked_file:
    """File wrapper that holds an OS-level lock (via _lock_file) while open.

    Usable as a context manager; attribute access is delegated to the
    underlying file object.
    """
    locked = False  # whether the lock is currently held

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Shared lock for pure reads, exclusive otherwise
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only AFTER the lock is held (see O_CREAT note above)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        # Idempotent: safe to call when the lock is not held
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without `with`
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2130
4eb7f1d1 2131
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
2136
2137
def shell_quote(args):
    """Return *args* joined into a single shell-safe command string."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
9d4660ca
PH
2147
2148
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    sdata = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{sdata}'
9d4660ca
PH
2157
2158
def unsmuggle_url(smug_url, default=None):
    """Extract data smuggled into a URL fragment; returns (url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
2166
2167
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with a metric suffix (k, M, G, ...); binary 'Ki'
    style suffixes are used when factor is 1024. Returns None for
    unparsable or negative input."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        exponent = min(int(math.log(num, factor)), len(suffixes))
    unit = ('', *suffixes)[exponent]
    if factor == 1024:
        unit = {'k': 'Ki', '': ''}.get(unit, f'{unit}i')
    return fmt % (num / factor ** exponent, unit)
e0fd9573 2180
2181
def format_bytes(bytes):
    """Human-readable byte count using binary units; 'N/A' when unparsable."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 2184
1c088fa8 2185
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' using *unit_table* as multipliers; None if no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Decimal comma is accepted as a decimal point
    value = float(m.group('num').replace(',', '.'))
    return int(value * unit_table[m.group('unit')])
2195
2196
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5.5MiB') into a byte count."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2266
2267
def parse_count(s):
    """Parse a view/like count such as '1.2M' or '15,347 views' into an int."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'Views 123' -> '123')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    count = lookup_unit_table(_UNIT_TABLE, s)
    if count is not None:
        return count

    # Last resort: a bare number followed by end-of-string or whitespace
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
be64b5b0 2295
2f7ae819 2296
def parse_resolution(s, *, lenient=False):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    if lenient:
        pattern = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        pattern = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    mobj = re.search(pattern, s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
2320
2321
def parse_bitrate(s):
    """Extract an integer kbps value from a string like '128 kbps'."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2328
2329
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
2339
2340
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
18258362
JMF
2349
2350
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' already starting a known entity or numeric reference is kept
    pattern = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(pattern, '&amp;', xml_str)
e3946f98
PH
2357
2358
def setproctitle(title):
    """Best-effort: set the process name via libc prctl; silently no-op
    when libc/prctl is unavailable."""
    assert isinstance(title, str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2383
2384
def remove_start(s, start):
    """Strip a leading *start* from *s* when present; None-safe."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2387
2388
def remove_end(s, end):
    """Strip a trailing *end* from *s* when present; None-safe."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
2391
2392
31b2051e
S
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
2400
2401
def get_domain(url):
    """Return the last two dot-separated labels of the URL's host."""
    netloc = urllib.parse.urlparse(url).netloc
    return '.'.join(netloc.rsplit('.', 2)[-2:])
b6e0c7d2
U
2404
2405
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    return urllib.parse.urlparse(url).path.strip('/').split('/')[-1]
aa94a6d3
PH
2409
2410
02dc0a36
S
def base_url(url):
    """Return the URL up to (and including) its last directory slash."""
    mobj = re.match(r'https?://[^?#&]+/', url)
    return mobj.group()
2413
2414
def urljoin(base, path):
    """Join *base* and *path*, returning None for unusable inputs.

    Absolute and scheme-relative paths are returned unchanged.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
e34c3361
S
2428
2429
class HEADRequest(urllib.request.Request):
    """A Request that always uses the HTTP HEAD verb."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2433
2434
class PUTRequest(urllib.request.Request):
    """A Request that always uses the HTTP PUT verb."""

    def get_method(self):
        return 'PUT'
2438
2439
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* (optionally an attribute of it) to a scaled int;
    *default* on failure."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v)
    except (ValueError, TypeError, OverflowError):
        return default
    return result * invscale // scale
9732d77e 2447
9572013d 2448
def str_or_none(v, default=None):
    """Stringify *v*, but map None to *default*."""
    if v is None:
        return default
    return str(v)
40a90862 2451
9732d77e
PH
2452
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Strip thousands separators and '+' before conversion
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
608d11f5
PH
2460
2461
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by invscale/scale; *default* on failure."""
    if v is None:
        return default
    try:
        value = float(v)
    except (ValueError, TypeError):
        return default
    return value * invscale / scale
43f775e4
PH
2469
2470
c7e327c4
S
def bool_or_none(v, default=None):
    """Return *v* only when it is an actual bool; otherwise *default*."""
    if isinstance(v, bool):
        return v
    return default
2473
2474
def strip_or_none(v, default=None):
    """Return v.strip() for strings; *default* for anything else."""
    if isinstance(v, str):
        return v.strip()
    return default
b72b4431
S
2477
2478
def url_or_none(url):
    """Return the stripped URL when it has a recognized scheme (or is
    scheme-relative); otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
af03000a
S
2484
2485
def request_to_url(req):
    """Return the URL of a urllib Request, or *req* itself when it is
    already a plain URL."""
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    return req
2491
2492
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp or a 'YYYYMMDD' string; *default* on failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            dt = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt = None
        return dt.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2503
2504
def parse_duration(s):
    """Parse a human-readable duration ('1:02:03', '3 min', 'PT1H2M3S', ...)
    into a number of seconds (float), or None when unparsable."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    # Colon-separated clock style: [[dd:]hh:]mm:ss[.ms]
    match = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if match:
        days, hours, mins, secs, ms = match.group('days', 'hours', 'mins', 'secs', 'ms')
    if not match:
        # Unit-suffixed / ISO-8601-like style
        match = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if match:
            days, hours, mins, secs, ms = match.groups()
    if not match:
        # Fractional hours/minutes only, e.g. '2.5 hours'
        match = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if not match:
            return None
        hours, mins = match.groups()

    if ms:
        ms = ms.replace(':', '.')
    total = 0
    for part, multiplier in ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)):
        total += float(part or 0) * multiplier
    return total
91d7d0b3
JMF
2559
2560
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's real extension.

    'abc.mp4' + 'temp' -> 'abc.temp.mp4'. When *expected_real_ext* is given
    and does not match the actual extension, *ext* is appended instead
    ('abc.mp4' -> 'abc.mp4.temp').
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        # BUGFIX: the fallback previously returned '(unknown).{ext}',
        # discarding the original filename entirely
        else f'{filename}.{ext}')
d70ad093
PH
2567
2568
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with *ext*; when *expected_real_ext*
    is given but does not match, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}'
2574
2575
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
2584
2585
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run *exe* with *args* and return its combined stdout/stderr text,
    or False when the executable cannot be started."""
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return stdout
cae97f65
PH
2598
2599
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output; *unrecognized*
    when no version can be found."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2609
2610
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2617
2618
def frange(start=0, stop=None, step=1):
    """Float range generator; a zero step yields nothing."""
    if stop is None:
        start, stop = 0, start
    if not step:
        return
    direction = 1 if step > 0 else -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2627
2628
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Subclass so callers can distinguish "LazyList ran out" from other IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared with another LazyList (see __reversed__/__copy__),
        # so both views consume the same underlying iterator exactly once
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what was already consumed, then keep caching new items
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Drain the iterator into the cache (forward order)
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        # Returns a plain list, reversed if this view is reversed
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map a forward index to the equivalent negative index from the end
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only consume as many items as the requested index/slice needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Truthiness only needs the first (or, reversed, the last) element
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Shares the iterator and cache with self; no data is duplicated
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2716
483336e7 2717
class PagedList:
    """Base class for lists fetched page-by-page via a pagefunc(pagenum) callback."""

    class IndexError(IndexError):
        # Raised instead of the builtin so callers can tell paging misses apart
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc: callable returning an iterable of entries for a page number
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Unknown page count by default; subclasses/errors may lower this
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        # Serve from cache when possible; pages past the known count are empty
        page_results = self._cache.get(pagenum)
        if page_results is None:
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Without the cache, repeated indexing would refetch pages
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
55575225 2756
9c44d242
PH
2757
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Walk pages starting at the one containing `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested window within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that pages from here on are unavailable
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2797
2798
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Clamp the last page to the known page count
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Elements to drop from the first page, and total elements still wanted
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page contains the final wanted elements
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2823
2824
class PlaylistEntries:
    """Iterator over playlist entries honoring --playlist-items style selections."""

    # Sentinel for a requested entry that is absent from the info dict
    MissingEntry = object()
    # Whether all entries are already known (no lazy extraction pending)
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            # Re-spread the partial entry list into their 1-based positions,
            # padding the gaps with MissingEntry
            assert self.is_exhausted
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches "<start>", "<start>:<end>", "<start>:<end>:<step>" with optional
    # signs and "inf"/"infinite" as end
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        # Yields an int for single items and a slice for ranges
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        # Yields (playlist_index, entry) pairs for the user-selected items
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        # Total number of entries if knowable without extraction, else None
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function fetching the 0-based i-th entry, raising
        # self.IndexError when it is unavailable
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Wrap the lookup so extraction errors are handled like
                    # any other per-entry extraction failure
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based playlist index, entry) pairs for an int or slice of
        # 1-based user-facing indices
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2957
2958
def uppercase_escape(s):
    """Decode ``\\UXXXXXXXX`` escape sequences in *s* into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
0fe2ff78
YCH
2965
2966
def lowercase_escape(s):
    """Decode ``\\uXXXX`` escape sequences in *s* into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
b53466e1 2973
d05cfe06
S
2974
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Reserved/unreserved characters are kept as-is
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
d05cfe06
S
2978
2979
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    escaped = {
        field: escape_rfc3986(getattr(parts, field))
        for field in ('path', 'params', 'query', 'fragment')
    }
    # The hostname is punycoded (IDNA) rather than percent-escaped
    escaped['netloc'] = parts.netloc.encode('idna').decode('ascii')
    return parts._replace(**escaped).geturl()
2990
62e609ab 2991
def parse_qs(url):
    """Parse the query string of *url* into a dict mapping keys to value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
4dfbf869 2994
2995
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read a batch file(-like object) and return the list of URLs in it.

    Blank lines and comment lines (starting with '#', ';' or ']') are skipped;
    UTF-8/UTF-16 BOMs, surrounding whitespace and trailing
    whitespace-separated '#' comments are stripped. *batch_fd* is closed.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword; positional use is deprecated
        # since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3013
3014
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
3017
3018
def update_url_query(url, query):
    """Return *url* with the key/value pairs of *query* merged into its query string."""
    if not query:
        return url
    parsed = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(parsed.query)
    params.update(query)
    new_query = urllib.parse.urlencode(params, True)
    return urllib.parse.urlunparse(parsed._replace(query=new_query))
16392824 3027
8e60dc75 3028
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req* into a fresh Request, optionally overriding url, data,
    headers and/or query parameters. The HTTP method is preserved."""
    new_headers = req.headers.copy()
    new_headers.update(headers or {})

    # Preserve non-standard verbs via the dedicated request classes
    method_to_class = {'HEAD': HEADRequest, 'PUT': PUTRequest}
    req_type = method_to_class.get(req.get_method(), urllib.request.Request)

    new_req = req_type(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3047
3048
10c87c15 3049def _multipart_encode_impl(data, boundary):
0c265486
YCH
3050 content_type = 'multipart/form-data; boundary=%s' % boundary
3051
3052 out = b''
3053 for k, v in data.items():
3054 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3055 if isinstance(k, str):
0f06bcd7 3056 k = k.encode()
14f25df2 3057 if isinstance(v, str):
0f06bcd7 3058 v = v.encode()
0c265486
YCH
3059 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3060 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3061 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3062 if boundary.encode('ascii') in content:
3063 raise ValueError('Boundary overlaps with data')
3064 out += content
3065
3066 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3067
3068 return out, content_type
3069
3070
def multipart_encode(data, boundary=None):
    """Encode a dict to RFC 7578-compliant form-data.

    data: dict whose keys and values may be str or bytes-like objects.
    boundary: used as-is when given; otherwise a random boundary is
        generated and regenerated on collision with the data.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # A caller-supplied boundary cannot be regenerated
            if has_specified_boundary:
                raise
            boundary = None
3099
3100
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value of the given key(s) in *d*.

    None values are always skipped; falsy values are skipped too unless
    skip_false_values is disabled. Falls back to *default*."""
    for candidate in map(d.get, variadic(key_or_keys)):
        if candidate is None:
            continue
        if candidate or not skip_false_values:
            return candidate
    return default
cbecc9b9
S
3106
3107
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn, returning the first result that does not
    raise a common lookup/type error and (optionally) matches expected_type."""
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
3117
3118
def try_get(src, getter, expected_type=None):
    """Apply one or more getters to *src*, returning the first result that
    succeeds and matches *expected_type* (if given)."""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
329ca3be
S
3121
3122
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* keeping only items for which cndn(key, value) is true.

    By default, items with a None value are dropped."""
    result = {}
    for key, value in dct.items():
        if cndn(key, value):
            result[key] = value
    return result
3125
3126
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left to right. Earlier non-None values win, except that an
    empty string may later be replaced by a non-empty string value."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if value is None:
                continue
            if key not in merged or (isinstance(value, str) and merged[key] == ''):
                merged[key] = value
    return merged
3135
3136
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as str, decoding bytes with *encoding* if necessary."""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
8e60dc75 3139
16392824 3140
a1a530b0
PH
# MPAA movie ratings mapped to the minimum viewer age they imply
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines ratings mapped to the minimum viewer age they imply
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3158
3159
def parse_age_limit(s):
    """Parse an age limit (plain int, 'NN+', US movie rating or US TV rating)
    into an integer age, or None when it cannot be interpreted."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] if m else None
146c80e2
S
3176
3177
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the raw JSON payload."""
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
478c2c61
PH
3186
3187
def js_to_json(code, vars={}):
    """Convert a JavaScript object/value literal into valid JSON text."""
    # vars is a dict of var, val pairs to substitute
    # NB: vars is only read here, so the mutable default is safe
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Hex and octal integer literals (optionally used as object keys)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        # Normalize a single token matched by the big regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Re-escape string contents for JSON (single -> double quotes,
            # drop line continuations, \x -> \u00)
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Convert hex/octal integers (and integer keys) to decimal
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        # Substitute known variable names with their provided values
        if v in vars:
            return vars[v]

        # Bare identifiers become JSON strings
        return '"%s"' % v

    def create_map(mobj):
        # Translate `new Map([[k, v], ...])` into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3240
3241
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(qid):
        # Position in the list is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return rank
3250
acd69589 3251
# Stages at which postprocessors may be attached, in execution order
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates per file type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types and their default filename component
# NOTE(review): None here appears to mean "derive from the video filename" — confirm against YoutubeDL usage
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template regex for printf-style fields; {0} is the key pattern, {1} the
# allowed conversion types
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters accepted in printf-style format strings
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3291
7d1eb38a 3292
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
3301
3302
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
3305
3306
def is_outdated_version(version, limit, assume_new=True):
    """Return whether *version* is older than *limit*.

    When *version* is missing or unparseable, return the inverse of
    *assume_new* (i.e. assume up-to-date by default)."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3314
3315
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported lazily to avoid a circular import at module load time
    from .update import is_non_updateable as _non_updateable
    return not _non_updateable()
7d4111ed
PH
3322
3323
def args_to_str(args):
    """Get a short, shell-quoted string representation for a subprocess command."""
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2ccd1b10
PH
3327
3328
def error_to_compat_str(err):
    """Compatibility shim: return the exception's message as str."""
    return f'{err}'
fdae2358
S
3331
3332
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return f'{type(err).__name__}: {err}'
3335
3336
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    MIME parameters (e.g. "; charset=utf-8") are ignored. Returns None when
    *mt* is None; falls back to the subtype with '+' replaced by '.' when no
    mapping is known."""
    if mt is None:
        return None

    # Discard any parameters after ';'
    mt = mt.partition(';')[0].strip()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }
    if mt in FULL_MAP:
        return FULL_MAP[mt]

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }
    _, _, subtype = mt.rpartition('/')
    if subtype.lower() in SUBTYPE_MAP:
        return SUBTYPE_MAP[subtype.lower()]

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }
    # Structured-syntax suffix, e.g. "vnd.foo+json" -> "json"
    _, _, suffix = subtype.partition('+')
    if suffix in SUFFIX_MAP:
        return SUFFIX_MAP[suffix]

    return subtype.replace('+', '.')
c460bdd5
PH
3399
3400
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL; None when unknown."""
    if not ext_or_url:
        return None
    # A bare extension must be turned into a filename for guess_type
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(name)[0]
3407
3408
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/scodec/dynamic_range."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Strip zeros so e.g. "avc01" matches "avc1"
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            # First video codec wins
            if not vcodec:
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
                # Detect HDR flavour from the codec tag
                if codec in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                    hdr = 'HDR10'
                elif full_codec.replace('0', '').startswith('vp9.2'):
                    hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            # Subtitle codecs
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Exactly two unknown codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3450
3451
def urlhandle_detect_ext(url_handle):
    """Guess the file extension from a response's headers: the
    Content-Disposition filename first, then the Content-Type."""
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3464
3465
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 ``data:`` URI for *data* with the given MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{encoded}'
3468
3469
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3478
3479
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer BOMs are listed before their prefixes (utf-32-le before utf-16-le)
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3497
3498
def determine_protocol(info_dict):
    """Infer the download protocol for *info_dict* from its URL and extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    # Streaming schemes (also covers e.g. rtmpe/rtmps via the prefix match)
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3519
3520
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: terminal escape sequences and tabs take no columns
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Drop cells whose filter entry is falsy; missing entries keep the cell
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns whose data cells are all empty are removed
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a separator row made of the delimiter characters
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align by converting the tab into left padding
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
347de493
PH
3551
3552
8f18aca8 3553def _match_one(filter_part, dct, incomplete):
77b87f05 3554 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3555 STRING_OPERATORS = {
3556 '*=': operator.contains,
3557 '^=': lambda attr, value: attr.startswith(value),
3558 '$=': lambda attr, value: attr.endswith(value),
3559 '~=': lambda attr, value: re.search(value, attr),
3560 }
347de493 3561 COMPARISON_OPERATORS = {
a047eeb6 3562 **STRING_OPERATORS,
3563 '<=': operator.le, # "<=" must be defined above "<"
347de493 3564 '<': operator.lt,
347de493 3565 '>=': operator.ge,
a047eeb6 3566 '>': operator.gt,
347de493 3567 '=': operator.eq,
347de493 3568 }
a047eeb6 3569
6db9c4d5 3570 if isinstance(incomplete, bool):
3571 is_incomplete = lambda _: incomplete
3572 else:
3573 is_incomplete = lambda k: k in incomplete
3574
64fa820c 3575 operator_rex = re.compile(r'''(?x)
347de493 3576 (?P<key>[a-z_]+)
77b87f05 3577 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3578 (?:
a047eeb6 3579 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3580 (?P<strval>.+?)
347de493 3581 )
347de493 3582 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3583 m = operator_rex.fullmatch(filter_part.strip())
347de493 3584 if m:
18f96d12 3585 m = m.groupdict()
3586 unnegated_op = COMPARISON_OPERATORS[m['op']]
3587 if m['negation']:
77b87f05
MT
3588 op = lambda attr, value: not unnegated_op(attr, value)
3589 else:
3590 op = unnegated_op
18f96d12 3591 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3592 if m['quote']:
3593 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3594 actual_value = dct.get(m['key'])
3595 numeric_comparison = None
f9934b96 3596 if isinstance(actual_value, (int, float)):
e5a088dc
S
3597 # If the original field is a string and matching comparisonvalue is
3598 # a number we should respect the origin of the original field
3599 # and process comparison value as a string (see
18f96d12 3600 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3601 try:
18f96d12 3602 numeric_comparison = int(comparison_value)
347de493 3603 except ValueError:
18f96d12 3604 numeric_comparison = parse_filesize(comparison_value)
3605 if numeric_comparison is None:
3606 numeric_comparison = parse_filesize(f'{comparison_value}B')
3607 if numeric_comparison is None:
3608 numeric_comparison = parse_duration(comparison_value)
3609 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3610 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3611 if actual_value is None:
6db9c4d5 3612 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3613 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3614
3615 UNARY_OPERATORS = {
1cc47c66
S
3616 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3617 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3618 }
64fa820c 3619 operator_rex = re.compile(r'''(?x)
347de493 3620 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3621 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3622 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3623 if m:
3624 op = UNARY_OPERATORS[m.group('op')]
3625 actual_value = dct.get(m.group('key'))
6db9c4d5 3626 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3627 return True
347de493
PH
3628 return op(actual_value)
3629
3630 raise ValueError('Invalid filter part %r' % filter_part)
3631
3632
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are '&'-separated; a literal '&' may be escaped as '\&'
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
347de493
PH
3643
3644
def match_filter_func(filters):
    """Build a match-filter callable from one or more filter strings.

    Returns None when no filters are given. The special filter '-' enables
    interactive mode, signalled to the caller by returning NO_DEFAULT.
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        # A video passes when any single filter string matches
        passes = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if passes:
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'video'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'

    return _match_func
91410c9b
PH
3662
3663
def download_range_func(chapters, ranges):
    """Return a generator function yielding the sections to download:
    chapters whose title matches any regex in *chapters*, followed by the
    explicit (start, end) pairs in *ranges*."""
    def inner(info_dict, ydl):
        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': idx}
        if chapters and warning:
            # Chapter regexes were given but nothing matched
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in ranges or []:
            yield {'start_time': start, 'end_time': end}

    return inner
3679
3680
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds (float); None if empty
    or unparsable."""
    if not time_expr:
        return None

    # Plain offset, optionally suffixed with 's' (e.g. '12.5s')
    match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if match:
        return float(match.group('time_offset'))

    # Clock format HH:MM:SS[.fff] (some files use ':' before the fraction)
    match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if match:
        hours, minutes, secs = match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(secs.replace(':', '.'))
bf6427d2
YCH
3692
3693
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hrs, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msec)
3696
3697
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS/SSA timecode (H:MM:SS.cc)."""
    tc = timetuple_from_msec(seconds * 1000)
    centiseconds = tc.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*tc[:-1], centiseconds)
bf6427d2
YCH
3701
3702
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Legacy TTML namespace URIs (right) are rewritten to the modern ones
    # (left) before parsing, so a single set of xpath expressions works
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Subset of TTML styling properties that are converted to SRT markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Expand 'prefix:tag' into the fully-qualified ElementTree tag name
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}         # style id -> resolved style properties
    default_style = {}  # style inherited from the body/div element

    class TTMLPElementParser:
        # Event-driven XMLParser target that renders one <p> subtree into
        # SRT-compatible text with <b>/<i>/<u>/<font> markup.
        # NOTE(review): these are class-level attributes; the two lists are
        # shared between instances, though each parse pops exactly what it
        # appended and '_out +=' rebinds per instance -- confirm instances
        # are never used concurrently
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style: document default, then the referenced
                # style, then inline tts:* attributes (highest precedence)
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect on the parent
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start() in reverse order
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and stream it through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces on the raw bytes before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> inheritance; repeat while a parent style has not been
    # seen yet (styles may reference parents defined later in the document)
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced by <body> or <div> becomes the document default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # Emit one SRT cue (1-based index) per <p> that has usable timing
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + dur when no explicit end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3865
3866
def cli_option(params, command_option, param, separator=None):
    """Build the CLI argument(s) for *param*'s value in *params*.

    Returns [] when the value is unset, ['opt', 'value'] without a separator,
    or ['opt<sep>value'] when *separator* is given.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3872
3873
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the CLI argument(s) for a boolean *param* in *params*, mapping
    True/False to *true_value*/*false_value*; [] when unset."""
    param = params.get(param)
    assert param in (True, False, None)
    if param is None:
        return []
    value = true_value if param else false_value
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3878
3879
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*,
    else [] (for flag-style options that take no value)."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
3882
3883
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up extra CLI args in *argdict* by trying each entry of *keys* in
    order; the first key (or key group) with a value wins.

    A plain list/tuple *argdict* is the legacy format and is returned as-is
    when *use_compat* is set.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        # A key_list may be a single key or a group of equivalent keys
        matches = [argdict.get(key.lower()) for key in variadic(key_list)]
        matches = [args for args in matches if args is not None]
        if matches:
            return [arg for args in matches for arg in args]
    return default
66e289ba 3902
6251555f 3903
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the key hierarchy for *exe* under *main_key* (e.g.
    'postprocessor+ffmpeg', 'ffmpeg', 'default') and delegate the lookup to
    cli_configuration_args."""
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{key}' for key in (keys or [''])]
    if root_key not in keys:
        # Only specific sub-keys were requested; disable the legacy format
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3915
66e289ba 3916
86e5f3ed 3917class ISO639Utils:
39672624
YCH
3918 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3919 _lang_map = {
3920 'aa': 'aar',
3921 'ab': 'abk',
3922 'ae': 'ave',
3923 'af': 'afr',
3924 'ak': 'aka',
3925 'am': 'amh',
3926 'an': 'arg',
3927 'ar': 'ara',
3928 'as': 'asm',
3929 'av': 'ava',
3930 'ay': 'aym',
3931 'az': 'aze',
3932 'ba': 'bak',
3933 'be': 'bel',
3934 'bg': 'bul',
3935 'bh': 'bih',
3936 'bi': 'bis',
3937 'bm': 'bam',
3938 'bn': 'ben',
3939 'bo': 'bod',
3940 'br': 'bre',
3941 'bs': 'bos',
3942 'ca': 'cat',
3943 'ce': 'che',
3944 'ch': 'cha',
3945 'co': 'cos',
3946 'cr': 'cre',
3947 'cs': 'ces',
3948 'cu': 'chu',
3949 'cv': 'chv',
3950 'cy': 'cym',
3951 'da': 'dan',
3952 'de': 'deu',
3953 'dv': 'div',
3954 'dz': 'dzo',
3955 'ee': 'ewe',
3956 'el': 'ell',
3957 'en': 'eng',
3958 'eo': 'epo',
3959 'es': 'spa',
3960 'et': 'est',
3961 'eu': 'eus',
3962 'fa': 'fas',
3963 'ff': 'ful',
3964 'fi': 'fin',
3965 'fj': 'fij',
3966 'fo': 'fao',
3967 'fr': 'fra',
3968 'fy': 'fry',
3969 'ga': 'gle',
3970 'gd': 'gla',
3971 'gl': 'glg',
3972 'gn': 'grn',
3973 'gu': 'guj',
3974 'gv': 'glv',
3975 'ha': 'hau',
3976 'he': 'heb',
b7acc835 3977 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3978 'hi': 'hin',
3979 'ho': 'hmo',
3980 'hr': 'hrv',
3981 'ht': 'hat',
3982 'hu': 'hun',
3983 'hy': 'hye',
3984 'hz': 'her',
3985 'ia': 'ina',
3986 'id': 'ind',
b7acc835 3987 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3988 'ie': 'ile',
3989 'ig': 'ibo',
3990 'ii': 'iii',
3991 'ik': 'ipk',
3992 'io': 'ido',
3993 'is': 'isl',
3994 'it': 'ita',
3995 'iu': 'iku',
3996 'ja': 'jpn',
3997 'jv': 'jav',
3998 'ka': 'kat',
3999 'kg': 'kon',
4000 'ki': 'kik',
4001 'kj': 'kua',
4002 'kk': 'kaz',
4003 'kl': 'kal',
4004 'km': 'khm',
4005 'kn': 'kan',
4006 'ko': 'kor',
4007 'kr': 'kau',
4008 'ks': 'kas',
4009 'ku': 'kur',
4010 'kv': 'kom',
4011 'kw': 'cor',
4012 'ky': 'kir',
4013 'la': 'lat',
4014 'lb': 'ltz',
4015 'lg': 'lug',
4016 'li': 'lim',
4017 'ln': 'lin',
4018 'lo': 'lao',
4019 'lt': 'lit',
4020 'lu': 'lub',
4021 'lv': 'lav',
4022 'mg': 'mlg',
4023 'mh': 'mah',
4024 'mi': 'mri',
4025 'mk': 'mkd',
4026 'ml': 'mal',
4027 'mn': 'mon',
4028 'mr': 'mar',
4029 'ms': 'msa',
4030 'mt': 'mlt',
4031 'my': 'mya',
4032 'na': 'nau',
4033 'nb': 'nob',
4034 'nd': 'nde',
4035 'ne': 'nep',
4036 'ng': 'ndo',
4037 'nl': 'nld',
4038 'nn': 'nno',
4039 'no': 'nor',
4040 'nr': 'nbl',
4041 'nv': 'nav',
4042 'ny': 'nya',
4043 'oc': 'oci',
4044 'oj': 'oji',
4045 'om': 'orm',
4046 'or': 'ori',
4047 'os': 'oss',
4048 'pa': 'pan',
4049 'pi': 'pli',
4050 'pl': 'pol',
4051 'ps': 'pus',
4052 'pt': 'por',
4053 'qu': 'que',
4054 'rm': 'roh',
4055 'rn': 'run',
4056 'ro': 'ron',
4057 'ru': 'rus',
4058 'rw': 'kin',
4059 'sa': 'san',
4060 'sc': 'srd',
4061 'sd': 'snd',
4062 'se': 'sme',
4063 'sg': 'sag',
4064 'si': 'sin',
4065 'sk': 'slk',
4066 'sl': 'slv',
4067 'sm': 'smo',
4068 'sn': 'sna',
4069 'so': 'som',
4070 'sq': 'sqi',
4071 'sr': 'srp',
4072 'ss': 'ssw',
4073 'st': 'sot',
4074 'su': 'sun',
4075 'sv': 'swe',
4076 'sw': 'swa',
4077 'ta': 'tam',
4078 'te': 'tel',
4079 'tg': 'tgk',
4080 'th': 'tha',
4081 'ti': 'tir',
4082 'tk': 'tuk',
4083 'tl': 'tgl',
4084 'tn': 'tsn',
4085 'to': 'ton',
4086 'tr': 'tur',
4087 'ts': 'tso',
4088 'tt': 'tat',
4089 'tw': 'twi',
4090 'ty': 'tah',
4091 'ug': 'uig',
4092 'uk': 'ukr',
4093 'ur': 'urd',
4094 'uz': 'uzb',
4095 've': 'ven',
4096 'vi': 'vie',
4097 'vo': 'vol',
4098 'wa': 'wln',
4099 'wo': 'wol',
4100 'xh': 'xho',
4101 'yi': 'yid',
e9a50fba 4102 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4103 'yo': 'yor',
4104 'za': 'zha',
4105 'zh': 'zho',
4106 'zu': 'zul',
4107 }
4108
4109 @classmethod
4110 def short2long(cls, code):
4111 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4112 return cls._lang_map.get(code[:2])
4113
4114 @classmethod
4115 def long2short(cls, code):
4116 """Convert language code from ISO 639-2/T to ISO 639-1"""
4117 for short_name, long_name in cls._lang_map.items():
4118 if long_name == code:
4119 return short_name
4120
4121
86e5f3ed 4122class ISO3166Utils:
4eb10f66
YCH
4123 # From http://data.okfn.org/data/core/country-list
4124 _country_map = {
4125 'AF': 'Afghanistan',
4126 'AX': 'Åland Islands',
4127 'AL': 'Albania',
4128 'DZ': 'Algeria',
4129 'AS': 'American Samoa',
4130 'AD': 'Andorra',
4131 'AO': 'Angola',
4132 'AI': 'Anguilla',
4133 'AQ': 'Antarctica',
4134 'AG': 'Antigua and Barbuda',
4135 'AR': 'Argentina',
4136 'AM': 'Armenia',
4137 'AW': 'Aruba',
4138 'AU': 'Australia',
4139 'AT': 'Austria',
4140 'AZ': 'Azerbaijan',
4141 'BS': 'Bahamas',
4142 'BH': 'Bahrain',
4143 'BD': 'Bangladesh',
4144 'BB': 'Barbados',
4145 'BY': 'Belarus',
4146 'BE': 'Belgium',
4147 'BZ': 'Belize',
4148 'BJ': 'Benin',
4149 'BM': 'Bermuda',
4150 'BT': 'Bhutan',
4151 'BO': 'Bolivia, Plurinational State of',
4152 'BQ': 'Bonaire, Sint Eustatius and Saba',
4153 'BA': 'Bosnia and Herzegovina',
4154 'BW': 'Botswana',
4155 'BV': 'Bouvet Island',
4156 'BR': 'Brazil',
4157 'IO': 'British Indian Ocean Territory',
4158 'BN': 'Brunei Darussalam',
4159 'BG': 'Bulgaria',
4160 'BF': 'Burkina Faso',
4161 'BI': 'Burundi',
4162 'KH': 'Cambodia',
4163 'CM': 'Cameroon',
4164 'CA': 'Canada',
4165 'CV': 'Cape Verde',
4166 'KY': 'Cayman Islands',
4167 'CF': 'Central African Republic',
4168 'TD': 'Chad',
4169 'CL': 'Chile',
4170 'CN': 'China',
4171 'CX': 'Christmas Island',
4172 'CC': 'Cocos (Keeling) Islands',
4173 'CO': 'Colombia',
4174 'KM': 'Comoros',
4175 'CG': 'Congo',
4176 'CD': 'Congo, the Democratic Republic of the',
4177 'CK': 'Cook Islands',
4178 'CR': 'Costa Rica',
4179 'CI': 'Côte d\'Ivoire',
4180 'HR': 'Croatia',
4181 'CU': 'Cuba',
4182 'CW': 'Curaçao',
4183 'CY': 'Cyprus',
4184 'CZ': 'Czech Republic',
4185 'DK': 'Denmark',
4186 'DJ': 'Djibouti',
4187 'DM': 'Dominica',
4188 'DO': 'Dominican Republic',
4189 'EC': 'Ecuador',
4190 'EG': 'Egypt',
4191 'SV': 'El Salvador',
4192 'GQ': 'Equatorial Guinea',
4193 'ER': 'Eritrea',
4194 'EE': 'Estonia',
4195 'ET': 'Ethiopia',
4196 'FK': 'Falkland Islands (Malvinas)',
4197 'FO': 'Faroe Islands',
4198 'FJ': 'Fiji',
4199 'FI': 'Finland',
4200 'FR': 'France',
4201 'GF': 'French Guiana',
4202 'PF': 'French Polynesia',
4203 'TF': 'French Southern Territories',
4204 'GA': 'Gabon',
4205 'GM': 'Gambia',
4206 'GE': 'Georgia',
4207 'DE': 'Germany',
4208 'GH': 'Ghana',
4209 'GI': 'Gibraltar',
4210 'GR': 'Greece',
4211 'GL': 'Greenland',
4212 'GD': 'Grenada',
4213 'GP': 'Guadeloupe',
4214 'GU': 'Guam',
4215 'GT': 'Guatemala',
4216 'GG': 'Guernsey',
4217 'GN': 'Guinea',
4218 'GW': 'Guinea-Bissau',
4219 'GY': 'Guyana',
4220 'HT': 'Haiti',
4221 'HM': 'Heard Island and McDonald Islands',
4222 'VA': 'Holy See (Vatican City State)',
4223 'HN': 'Honduras',
4224 'HK': 'Hong Kong',
4225 'HU': 'Hungary',
4226 'IS': 'Iceland',
4227 'IN': 'India',
4228 'ID': 'Indonesia',
4229 'IR': 'Iran, Islamic Republic of',
4230 'IQ': 'Iraq',
4231 'IE': 'Ireland',
4232 'IM': 'Isle of Man',
4233 'IL': 'Israel',
4234 'IT': 'Italy',
4235 'JM': 'Jamaica',
4236 'JP': 'Japan',
4237 'JE': 'Jersey',
4238 'JO': 'Jordan',
4239 'KZ': 'Kazakhstan',
4240 'KE': 'Kenya',
4241 'KI': 'Kiribati',
4242 'KP': 'Korea, Democratic People\'s Republic of',
4243 'KR': 'Korea, Republic of',
4244 'KW': 'Kuwait',
4245 'KG': 'Kyrgyzstan',
4246 'LA': 'Lao People\'s Democratic Republic',
4247 'LV': 'Latvia',
4248 'LB': 'Lebanon',
4249 'LS': 'Lesotho',
4250 'LR': 'Liberia',
4251 'LY': 'Libya',
4252 'LI': 'Liechtenstein',
4253 'LT': 'Lithuania',
4254 'LU': 'Luxembourg',
4255 'MO': 'Macao',
4256 'MK': 'Macedonia, the Former Yugoslav Republic of',
4257 'MG': 'Madagascar',
4258 'MW': 'Malawi',
4259 'MY': 'Malaysia',
4260 'MV': 'Maldives',
4261 'ML': 'Mali',
4262 'MT': 'Malta',
4263 'MH': 'Marshall Islands',
4264 'MQ': 'Martinique',
4265 'MR': 'Mauritania',
4266 'MU': 'Mauritius',
4267 'YT': 'Mayotte',
4268 'MX': 'Mexico',
4269 'FM': 'Micronesia, Federated States of',
4270 'MD': 'Moldova, Republic of',
4271 'MC': 'Monaco',
4272 'MN': 'Mongolia',
4273 'ME': 'Montenegro',
4274 'MS': 'Montserrat',
4275 'MA': 'Morocco',
4276 'MZ': 'Mozambique',
4277 'MM': 'Myanmar',
4278 'NA': 'Namibia',
4279 'NR': 'Nauru',
4280 'NP': 'Nepal',
4281 'NL': 'Netherlands',
4282 'NC': 'New Caledonia',
4283 'NZ': 'New Zealand',
4284 'NI': 'Nicaragua',
4285 'NE': 'Niger',
4286 'NG': 'Nigeria',
4287 'NU': 'Niue',
4288 'NF': 'Norfolk Island',
4289 'MP': 'Northern Mariana Islands',
4290 'NO': 'Norway',
4291 'OM': 'Oman',
4292 'PK': 'Pakistan',
4293 'PW': 'Palau',
4294 'PS': 'Palestine, State of',
4295 'PA': 'Panama',
4296 'PG': 'Papua New Guinea',
4297 'PY': 'Paraguay',
4298 'PE': 'Peru',
4299 'PH': 'Philippines',
4300 'PN': 'Pitcairn',
4301 'PL': 'Poland',
4302 'PT': 'Portugal',
4303 'PR': 'Puerto Rico',
4304 'QA': 'Qatar',
4305 'RE': 'Réunion',
4306 'RO': 'Romania',
4307 'RU': 'Russian Federation',
4308 'RW': 'Rwanda',
4309 'BL': 'Saint Barthélemy',
4310 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4311 'KN': 'Saint Kitts and Nevis',
4312 'LC': 'Saint Lucia',
4313 'MF': 'Saint Martin (French part)',
4314 'PM': 'Saint Pierre and Miquelon',
4315 'VC': 'Saint Vincent and the Grenadines',
4316 'WS': 'Samoa',
4317 'SM': 'San Marino',
4318 'ST': 'Sao Tome and Principe',
4319 'SA': 'Saudi Arabia',
4320 'SN': 'Senegal',
4321 'RS': 'Serbia',
4322 'SC': 'Seychelles',
4323 'SL': 'Sierra Leone',
4324 'SG': 'Singapore',
4325 'SX': 'Sint Maarten (Dutch part)',
4326 'SK': 'Slovakia',
4327 'SI': 'Slovenia',
4328 'SB': 'Solomon Islands',
4329 'SO': 'Somalia',
4330 'ZA': 'South Africa',
4331 'GS': 'South Georgia and the South Sandwich Islands',
4332 'SS': 'South Sudan',
4333 'ES': 'Spain',
4334 'LK': 'Sri Lanka',
4335 'SD': 'Sudan',
4336 'SR': 'Suriname',
4337 'SJ': 'Svalbard and Jan Mayen',
4338 'SZ': 'Swaziland',
4339 'SE': 'Sweden',
4340 'CH': 'Switzerland',
4341 'SY': 'Syrian Arab Republic',
4342 'TW': 'Taiwan, Province of China',
4343 'TJ': 'Tajikistan',
4344 'TZ': 'Tanzania, United Republic of',
4345 'TH': 'Thailand',
4346 'TL': 'Timor-Leste',
4347 'TG': 'Togo',
4348 'TK': 'Tokelau',
4349 'TO': 'Tonga',
4350 'TT': 'Trinidad and Tobago',
4351 'TN': 'Tunisia',
4352 'TR': 'Turkey',
4353 'TM': 'Turkmenistan',
4354 'TC': 'Turks and Caicos Islands',
4355 'TV': 'Tuvalu',
4356 'UG': 'Uganda',
4357 'UA': 'Ukraine',
4358 'AE': 'United Arab Emirates',
4359 'GB': 'United Kingdom',
4360 'US': 'United States',
4361 'UM': 'United States Minor Outlying Islands',
4362 'UY': 'Uruguay',
4363 'UZ': 'Uzbekistan',
4364 'VU': 'Vanuatu',
4365 'VE': 'Venezuela, Bolivarian Republic of',
4366 'VN': 'Viet Nam',
4367 'VG': 'Virgin Islands, British',
4368 'VI': 'Virgin Islands, U.S.',
4369 'WF': 'Wallis and Futuna',
4370 'EH': 'Western Sahara',
4371 'YE': 'Yemen',
4372 'ZM': 'Zambia',
4373 'ZW': 'Zimbabwe',
2f97cc61 4374 # Not ISO 3166 codes, but used for IP blocks
4375 'AP': 'Asia/Pacific Region',
4376 'EU': 'Europe',
4eb10f66
YCH
4377 }
4378
4379 @classmethod
4380 def short2full(cls, code):
4381 """Convert an ISO 3166-2 country code to the corresponding full name"""
4382 return cls._country_map.get(code.upper())
4383
4384
86e5f3ed 4385class GeoUtils:
773f291d
S
4386 # Major IPv4 address blocks per country
4387 _country_ip_map = {
53896ca5 4388 'AD': '46.172.224.0/19',
773f291d
S
4389 'AE': '94.200.0.0/13',
4390 'AF': '149.54.0.0/17',
4391 'AG': '209.59.64.0/18',
4392 'AI': '204.14.248.0/21',
4393 'AL': '46.99.0.0/16',
4394 'AM': '46.70.0.0/15',
4395 'AO': '105.168.0.0/13',
53896ca5
S
4396 'AP': '182.50.184.0/21',
4397 'AQ': '23.154.160.0/24',
773f291d
S
4398 'AR': '181.0.0.0/12',
4399 'AS': '202.70.112.0/20',
53896ca5 4400 'AT': '77.116.0.0/14',
773f291d
S
4401 'AU': '1.128.0.0/11',
4402 'AW': '181.41.0.0/18',
53896ca5
S
4403 'AX': '185.217.4.0/22',
4404 'AZ': '5.197.0.0/16',
773f291d
S
4405 'BA': '31.176.128.0/17',
4406 'BB': '65.48.128.0/17',
4407 'BD': '114.130.0.0/16',
4408 'BE': '57.0.0.0/8',
53896ca5 4409 'BF': '102.178.0.0/15',
773f291d
S
4410 'BG': '95.42.0.0/15',
4411 'BH': '37.131.0.0/17',
4412 'BI': '154.117.192.0/18',
4413 'BJ': '137.255.0.0/16',
53896ca5 4414 'BL': '185.212.72.0/23',
773f291d
S
4415 'BM': '196.12.64.0/18',
4416 'BN': '156.31.0.0/16',
4417 'BO': '161.56.0.0/16',
4418 'BQ': '161.0.80.0/20',
53896ca5 4419 'BR': '191.128.0.0/12',
773f291d
S
4420 'BS': '24.51.64.0/18',
4421 'BT': '119.2.96.0/19',
4422 'BW': '168.167.0.0/16',
4423 'BY': '178.120.0.0/13',
4424 'BZ': '179.42.192.0/18',
4425 'CA': '99.224.0.0/11',
4426 'CD': '41.243.0.0/16',
53896ca5
S
4427 'CF': '197.242.176.0/21',
4428 'CG': '160.113.0.0/16',
773f291d 4429 'CH': '85.0.0.0/13',
53896ca5 4430 'CI': '102.136.0.0/14',
773f291d
S
4431 'CK': '202.65.32.0/19',
4432 'CL': '152.172.0.0/14',
53896ca5 4433 'CM': '102.244.0.0/14',
773f291d
S
4434 'CN': '36.128.0.0/10',
4435 'CO': '181.240.0.0/12',
4436 'CR': '201.192.0.0/12',
4437 'CU': '152.206.0.0/15',
4438 'CV': '165.90.96.0/19',
4439 'CW': '190.88.128.0/17',
53896ca5 4440 'CY': '31.153.0.0/16',
773f291d
S
4441 'CZ': '88.100.0.0/14',
4442 'DE': '53.0.0.0/8',
4443 'DJ': '197.241.0.0/17',
4444 'DK': '87.48.0.0/12',
4445 'DM': '192.243.48.0/20',
4446 'DO': '152.166.0.0/15',
4447 'DZ': '41.96.0.0/12',
4448 'EC': '186.68.0.0/15',
4449 'EE': '90.190.0.0/15',
4450 'EG': '156.160.0.0/11',
4451 'ER': '196.200.96.0/20',
4452 'ES': '88.0.0.0/11',
4453 'ET': '196.188.0.0/14',
4454 'EU': '2.16.0.0/13',
4455 'FI': '91.152.0.0/13',
4456 'FJ': '144.120.0.0/16',
53896ca5 4457 'FK': '80.73.208.0/21',
773f291d
S
4458 'FM': '119.252.112.0/20',
4459 'FO': '88.85.32.0/19',
4460 'FR': '90.0.0.0/9',
4461 'GA': '41.158.0.0/15',
4462 'GB': '25.0.0.0/8',
4463 'GD': '74.122.88.0/21',
4464 'GE': '31.146.0.0/16',
4465 'GF': '161.22.64.0/18',
4466 'GG': '62.68.160.0/19',
53896ca5
S
4467 'GH': '154.160.0.0/12',
4468 'GI': '95.164.0.0/16',
773f291d
S
4469 'GL': '88.83.0.0/19',
4470 'GM': '160.182.0.0/15',
4471 'GN': '197.149.192.0/18',
4472 'GP': '104.250.0.0/19',
4473 'GQ': '105.235.224.0/20',
4474 'GR': '94.64.0.0/13',
4475 'GT': '168.234.0.0/16',
4476 'GU': '168.123.0.0/16',
4477 'GW': '197.214.80.0/20',
4478 'GY': '181.41.64.0/18',
4479 'HK': '113.252.0.0/14',
4480 'HN': '181.210.0.0/16',
4481 'HR': '93.136.0.0/13',
4482 'HT': '148.102.128.0/17',
4483 'HU': '84.0.0.0/14',
4484 'ID': '39.192.0.0/10',
4485 'IE': '87.32.0.0/12',
4486 'IL': '79.176.0.0/13',
4487 'IM': '5.62.80.0/20',
4488 'IN': '117.192.0.0/10',
4489 'IO': '203.83.48.0/21',
4490 'IQ': '37.236.0.0/14',
4491 'IR': '2.176.0.0/12',
4492 'IS': '82.221.0.0/16',
4493 'IT': '79.0.0.0/10',
4494 'JE': '87.244.64.0/18',
4495 'JM': '72.27.0.0/17',
4496 'JO': '176.29.0.0/16',
53896ca5 4497 'JP': '133.0.0.0/8',
773f291d
S
4498 'KE': '105.48.0.0/12',
4499 'KG': '158.181.128.0/17',
4500 'KH': '36.37.128.0/17',
4501 'KI': '103.25.140.0/22',
4502 'KM': '197.255.224.0/20',
53896ca5 4503 'KN': '198.167.192.0/19',
773f291d
S
4504 'KP': '175.45.176.0/22',
4505 'KR': '175.192.0.0/10',
4506 'KW': '37.36.0.0/14',
4507 'KY': '64.96.0.0/15',
4508 'KZ': '2.72.0.0/13',
4509 'LA': '115.84.64.0/18',
4510 'LB': '178.135.0.0/16',
53896ca5 4511 'LC': '24.92.144.0/20',
773f291d
S
4512 'LI': '82.117.0.0/19',
4513 'LK': '112.134.0.0/15',
53896ca5 4514 'LR': '102.183.0.0/16',
773f291d
S
4515 'LS': '129.232.0.0/17',
4516 'LT': '78.56.0.0/13',
4517 'LU': '188.42.0.0/16',
4518 'LV': '46.109.0.0/16',
4519 'LY': '41.252.0.0/14',
4520 'MA': '105.128.0.0/11',
4521 'MC': '88.209.64.0/18',
4522 'MD': '37.246.0.0/16',
4523 'ME': '178.175.0.0/17',
4524 'MF': '74.112.232.0/21',
4525 'MG': '154.126.0.0/17',
4526 'MH': '117.103.88.0/21',
4527 'MK': '77.28.0.0/15',
4528 'ML': '154.118.128.0/18',
4529 'MM': '37.111.0.0/17',
4530 'MN': '49.0.128.0/17',
4531 'MO': '60.246.0.0/16',
4532 'MP': '202.88.64.0/20',
4533 'MQ': '109.203.224.0/19',
4534 'MR': '41.188.64.0/18',
4535 'MS': '208.90.112.0/22',
4536 'MT': '46.11.0.0/16',
4537 'MU': '105.16.0.0/12',
4538 'MV': '27.114.128.0/18',
53896ca5 4539 'MW': '102.70.0.0/15',
773f291d
S
4540 'MX': '187.192.0.0/11',
4541 'MY': '175.136.0.0/13',
4542 'MZ': '197.218.0.0/15',
4543 'NA': '41.182.0.0/16',
4544 'NC': '101.101.0.0/18',
4545 'NE': '197.214.0.0/18',
4546 'NF': '203.17.240.0/22',
4547 'NG': '105.112.0.0/12',
4548 'NI': '186.76.0.0/15',
4549 'NL': '145.96.0.0/11',
4550 'NO': '84.208.0.0/13',
4551 'NP': '36.252.0.0/15',
4552 'NR': '203.98.224.0/19',
4553 'NU': '49.156.48.0/22',
4554 'NZ': '49.224.0.0/14',
4555 'OM': '5.36.0.0/15',
4556 'PA': '186.72.0.0/15',
4557 'PE': '186.160.0.0/14',
4558 'PF': '123.50.64.0/18',
4559 'PG': '124.240.192.0/19',
4560 'PH': '49.144.0.0/13',
4561 'PK': '39.32.0.0/11',
4562 'PL': '83.0.0.0/11',
4563 'PM': '70.36.0.0/20',
4564 'PR': '66.50.0.0/16',
4565 'PS': '188.161.0.0/16',
4566 'PT': '85.240.0.0/13',
4567 'PW': '202.124.224.0/20',
4568 'PY': '181.120.0.0/14',
4569 'QA': '37.210.0.0/15',
53896ca5 4570 'RE': '102.35.0.0/16',
773f291d 4571 'RO': '79.112.0.0/13',
53896ca5 4572 'RS': '93.86.0.0/15',
773f291d 4573 'RU': '5.136.0.0/13',
53896ca5 4574 'RW': '41.186.0.0/16',
773f291d
S
4575 'SA': '188.48.0.0/13',
4576 'SB': '202.1.160.0/19',
4577 'SC': '154.192.0.0/11',
53896ca5 4578 'SD': '102.120.0.0/13',
773f291d 4579 'SE': '78.64.0.0/12',
53896ca5 4580 'SG': '8.128.0.0/10',
773f291d
S
4581 'SI': '188.196.0.0/14',
4582 'SK': '78.98.0.0/15',
53896ca5 4583 'SL': '102.143.0.0/17',
773f291d
S
4584 'SM': '89.186.32.0/19',
4585 'SN': '41.82.0.0/15',
53896ca5 4586 'SO': '154.115.192.0/18',
773f291d
S
4587 'SR': '186.179.128.0/17',
4588 'SS': '105.235.208.0/21',
4589 'ST': '197.159.160.0/19',
4590 'SV': '168.243.0.0/16',
4591 'SX': '190.102.0.0/20',
4592 'SY': '5.0.0.0/16',
4593 'SZ': '41.84.224.0/19',
4594 'TC': '65.255.48.0/20',
4595 'TD': '154.68.128.0/19',
4596 'TG': '196.168.0.0/14',
4597 'TH': '171.96.0.0/13',
4598 'TJ': '85.9.128.0/18',
4599 'TK': '27.96.24.0/21',
4600 'TL': '180.189.160.0/20',
4601 'TM': '95.85.96.0/19',
4602 'TN': '197.0.0.0/11',
4603 'TO': '175.176.144.0/21',
4604 'TR': '78.160.0.0/11',
4605 'TT': '186.44.0.0/15',
4606 'TV': '202.2.96.0/19',
4607 'TW': '120.96.0.0/11',
4608 'TZ': '156.156.0.0/14',
53896ca5
S
4609 'UA': '37.52.0.0/14',
4610 'UG': '102.80.0.0/13',
4611 'US': '6.0.0.0/8',
773f291d 4612 'UY': '167.56.0.0/13',
53896ca5 4613 'UZ': '84.54.64.0/18',
773f291d 4614 'VA': '212.77.0.0/19',
53896ca5 4615 'VC': '207.191.240.0/21',
773f291d 4616 'VE': '186.88.0.0/13',
53896ca5 4617 'VG': '66.81.192.0/20',
773f291d
S
4618 'VI': '146.226.0.0/16',
4619 'VN': '14.160.0.0/11',
4620 'VU': '202.80.32.0/20',
4621 'WF': '117.20.32.0/21',
4622 'WS': '202.4.32.0/19',
4623 'YE': '134.35.0.0/16',
4624 'YT': '41.242.116.0/22',
4625 'ZA': '41.0.0.0/11',
53896ca5
S
4626 'ZM': '102.144.0.0/13',
4627 'ZW': '102.177.192.0/18',
773f291d
S
4628 }
4629
4630 @classmethod
5f95927a
S
4631 def random_ipv4(cls, code_or_block):
4632 if len(code_or_block) == 2:
4633 block = cls._country_ip_map.get(code_or_block.upper())
4634 if not block:
4635 return None
4636 else:
4637 block = code_or_block
773f291d 4638 addr, preflen = block.split('/')
ac668111 4639 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4640 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4641 return str(socket.inet_ntoa(
ac668111 4642 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4643
4644
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that allows overriding the proxy for a single request
    via the internal 'Ytdl-request-proxy' header.

    The special proxy value '__noproxy__' disables proxying entirely;
    socks* proxies are forwarded via 'Ytdl-socks-proxy' to the http(s)
    handlers, which wrap the socket themselves."""

    def __init__(self, proxies=None):
        # Set default handlers
        # NOTE: the lambda default arguments bind `type`/`meth` eagerly so
        # each generated handler keeps its own scheme (late-binding pitfall)
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (if present) takes precedence over the default
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
4668
4669
0a5445dd
YCH
4670# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4671# released into Public Domain
4672# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4673
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # minimal big-endian representation
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        # mirrors the historical behaviour: 0 (and negatives) -> single NUL
        s = b'\000'
    # front-pad so the total length is a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = b'\000' * (blocksize - len(s) % blocksize) + s
    return s
4702
4703
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # big-endian interpretation; empty input yields 0
    return int.from_bytes(s, 'big')
4719
4720
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # reversing the bytes makes this a little-endian integer interpretation
    payload = int(binascii.hexlify(data[::-1]), 16)
    return format(pow(payload, exponent, modulus), 'x')
81bdc8fd
YCH
4736
4737
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data     input data
    @param {int} length     target length
    @returns {int[]} padded data
    @raises ValueError      if the data cannot fit (needs 11 bytes of overhead)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (PKCS #1 v2.2) section 7.2.1: the padding string PS must
    # consist of *nonzero* octets — a zero octet would terminate the padding
    # early at decryption time (was randint(0, 254), which could emit zeros)
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4751
4752
7b2c3f47 4753def _base_n_table(n, table):
4754 if not table and not n:
4755 raise ValueError('Either table or n must be specified')
612f2be5 4756 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4757
4758 if n != len(table):
4759 raise ValueError(f'base {n} exceeds table length {len(table)}')
4760 return table
59f898b7 4761
5eb6bdce 4762
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    # collect least-significant digits first, then reverse
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4774
4775
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    # invert the digit table: character -> positional value
    lookup = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(lookup)
    result = 0
    for char in string:
        result = result * base + lookup[char]
    return result
4783
4784
def decode_base(value, digits):
    # Deprecated alias of decode_base_n, kept for backward compatibility
    write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
                 'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
    return decode_base_n(value, table=digits)
f52354a8
YCH
4789
4790
def decode_packed_codes(code):
    """Deobfuscate JavaScript packed with the "p.a.c.k.e.r." scheme."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # map each base-n-encoded index back to its original symbol
    # (falling back to the encoded form when the symbol slot is empty)
    symbol_table = {}
    for index in range(count):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
e154c651 4807
4808
1ced2221
S
def caesar(s, alphabet, shift):
    """Shift every character of *s* that occurs in *alphabet* by *shift*
    positions (wrapping around); other characters pass through unchanged."""
    if not shift:
        return s
    size = len(alphabet)

    def rotate(ch):
        pos = alphabet.find(ch)
        return ch if pos < 0 else alphabet[(pos + shift) % size]

    return ''.join(map(rotate, s))
4816
4817
def rot47(s):
    # ROT47: caesar over the full printable ASCII range (0x21-0x7e, 94 chars)
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4820
4821
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted value",...')
    into a dict, stripping the surrounding quotes from quoted values."""
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {key: val[1:-1] if val.startswith('"') else val
            for key, val in pairs}
1143535d
YCH
4829
4830
def urshift(val, n):
    """Unsigned (logical) right shift, treating a negative *val* as its
    32-bit two's-complement representation."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
d3f8e038
YCH
4833
4834
4835# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4836# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
def decode_png(png_data):
    """Decode raw PNG bytes into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of channel bytes.
    NOTE(review): stride is computed as width * 3, i.e. this assumes 8-bit
    truecolor (3 bytes/pixel) and non-interlaced data — the IHDR fields
    beyond width/height are not validated; confirm against callers.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # validate the 8-byte PNG signature and that the first chunk is IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    # big-endian unsigned int of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]

    chunks = []

    # split the stream into chunks: length(4) type(4) data(length) CRC(4)
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # image data may be split across multiple IDAT chunks; concatenate them
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    # look up an already-reconstructed channel byte by flat index
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    # each scanline is prefixed by one filter-type byte (hence 1 + stride)
    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # `left` is the corresponding channel of the previous pixel
            # (3 bytes back); `up` is the same channel one scanline up
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # undo the per-scanline filter (PNG spec section 9)
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: pick the neighbour closest to p
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
4940
4941
def write_xattr(path, key, value):
    """Set the extended attribute *key* of the file at *path* to the bytes
    *value*.

    Strategy: NTFS Alternate Data Streams on Windows; the xattr/pyxattr
    Python module if available; otherwise the setfattr/xattr executables.
    Raises XAttrMetadataError on failure and XAttrUnavailableError when no
    mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # the external tools take the value as a command-line string argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4991
4992
def random_birthday(year_field, month_field, day_field):
    """Pick a uniformly random date between 1950-01-01 and 1995-12-31 and
    return it as a dict mapping the given field names to string values."""
    start = datetime.date(1950, 1, 1)
    end = datetime.date(1995, 12, 31)
    chosen = start + datetime.timedelta(random.randint(0, (end - start).days))
    return {
        field: str(part)
        for field, part in zip(
            (year_field, month_field, day_field),
            (chosen.year, chosen.month, chosen.day))
    }
732044af 5003
c76eb41b 5004
# Templates for internet shortcut files, which are plain text files.

# Windows .url shortcut format
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS .webloc shortcut format (a binary-plist-compatible XML plist)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org .desktop entry (Linux)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# maps --write-link format names to their templates
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5036
732044af 5037
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    # rebuild the authority component: [user[:password]@]host[:port]
    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): port 80 is dropped regardless of scheme — confirm intended
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5080
5081
def to_high_limit_path(path):
    """On Windows/Cygwin, prefix the absolute path with \\\\?\\ to bypass the
    MAX_PATH limit; on other platforms return *path* unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 5088
c76eb41b 5089
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Format a field extracted from *obj* with *template*, passing it
    through *func* first; return *default* when the value is ignorable
    (falsy-but-not-zero by default, or contained in *ignore*)."""
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        ignored = not val and val != 0
    else:
        ignored = val in variadic(ignore)
    return default if ignored else template % func(val)
00dd0cd5 5095
5096
def clean_podcast_url(url):
    # Strip known podcast analytics/tracking redirect prefixes from the URL
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
ffcb8191
THD
5112
5113
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random string laid out like a UUID version 4
    (8-4-4-4-12 hex digits with '4' at the version position)."""
    template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
    return re.sub(r'[xy]', lambda _: _HEX_TABLE[random.randint(0, 15)], template)
0202b52a 5119
5120
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* if it does not exist.

    @param to_screen  optional callable used to report failures
    @returns          True on success (or nothing to do), False on OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # BUGFIX: was `if callable(to_screen) is not None:`, which is always
        # true (callable() returns a bool) and so crashed with a TypeError
        # whenever to_screen was None and directory creation failed
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 5131
5132
def get_executable_path():
    # Directory containing the running executable/script (delegates to the updater)
    from .update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5137
5138
def load_plugins(name, suffix, namespace):
    """Load attributes ending in *suffix* from ytdlp_plugins/<name>/__init__.py
    (located next to the executable) into *namespace*.

    Missing plugin directories are silently ignored; returns a dict of the
    newly added classes. Existing names in *namespace* are never overwritten.
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        # NOTE: the parameter `name` is shadowed by the loop variable below
        # (harmless — the parameter is no longer needed at this point)
        for name in dir(plugins):
            if name in namespace:
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes[name] = namespace[name] = klass
    return classes
06167fbb 5155
5156
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                            - None:     Do nothing
                            - string:   A dictionary key
                            - int:      An index into a list
                            - tuple:    A list of keys all of which will be traversed
                            - Ellipsis: Fetch all values in the object
                            - Function: Takes the key and value as arguments
                                        and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (nonlocal) counts how many branching levels (Ellipsis /
        # callable keys) were expanded, so results can be flattened later
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # a None key (or having reached a None object) ends traversal
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # branch: traverse each sub-key, then collect all results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # expand to all values of the current object
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # filter (key, value) pairs through the predicate
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # plain dict lookup (case-insensitive fallback when requested)
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # convert user-supplied strings to int indices or slices
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    # try each path in order; the first one yielding a usable value wins
    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # flatten nested branch results and drop Nones / type misses
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5253
5254
def traverse_dict(dictn, keys, casesense=True):
    # Deprecated alias of traverse_obj, kept for backward compatibility
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5259
5260
def get_first(obj, keys, **kwargs):
    # Convenience wrapper: first non-None match of `keys` across all items of `obj`
    return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5263
5264
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return *x* unchanged if it is a non-scalar iterable; otherwise wrap it
    in a 1-tuple. Types in *allowed_types* are treated as scalars even though
    they are iterable (str/bytes/dict by default)."""
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
bd50a52b
THD
5267
5268
3e9b66d7
LNO
def time_seconds(**kwargs):
    """Return the current POSIX timestamp; *kwargs* are timedelta arguments
    describing a fixed UTC offset (the timestamp itself is offset-independent)."""
    tz = datetime.timezone(datetime.timedelta(**kwargs))
    return datetime.datetime.now(tz).timestamp()
5272
5273
49fa4d9a
N
5274# create a JSON Web Signature (jws) with HS256 algorithm
5275# the resulting format is in JWS Compact Serialization
5276# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5277# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed token in JWS Compact Serialization (bytes).

    @param payload_data  JSON-serializable payload
    @param key           signing key (str)
    @param headers       optional extra/overriding JOSE header fields
                         (was a mutable `{}` default — now a None sentinel)

    NOTE: uses standard (padded) base64, not the unpadded base64url that
    RFC 7515 specifies — kept as-is since consumers expect this format.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
        **(headers or {}),
    }
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
819e0531 5291
5292
16b0d7e6 5293# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode (WITHOUT verifying the signature) the payload of a token in
    JWS Compact Serialization and return it as a dict."""
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWS uses unpadded base64url (RFC 7515), but urlsafe_b64decode requires
    # correct padding — re-add it, otherwise real-world tokens fail to decode
    payload_data = json.loads(base64.urlsafe_b64decode(
        payload_b64 + '=' * (-len(payload_b64) % 4)))
    return payload_data
5298
5299
# Tri-state: None on non-Windows (not applicable); on Windows, False until
# windows_enable_vt_mode() succeeds and flips it to True
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    # Whether ANSI escape sequences may be written to `stream`.
    # Cached per stream; windows_enable_vt_mode() clears the cache when
    # WINDOWS_VT_MODE changes.
    if compat_os_name == 'nt':
        if not WINDOWS_VT_MODE:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:  # pseudo/detached streams may lack a working isatty
        return False
5314
5315
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    # VT sequences require Windows 10 TH2 (build 10586) or newer
    if get_windows_version() < (10, 0, 10586):
        return
    global WINDOWS_VT_MODE
    try:
        # NOTE(review): presumably spawning a shell enables VT processing for
        # this console as a side effect — see the TODO above for the real API
        Popen.run('', shell=True)
    except Exception:
        return

    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
5327
5328
# Matches ANSI CSI sequences ending in 'm' (SGR — colors/styles)
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    # Strip color/style escape sequences, e.g. for log files or width calculations
    return _terminal_sequences_re.sub('', string)
5334
5335
def number_of_digits(number):
    # Length of the %d representation, including a leading '-' for negatives
    return len('%d' % number)
34921b43 5338
5339
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with *delim*.
    If *from_dict* is given, each value is first used as a traverse_obj
    path into that dict."""
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(v) for v in values if v)
06e57990 5344
5345
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand

    @param formats       list of format dicts (only 'width'/'height' are read)
    @param thumbnails    list of thumbnail dicts (must contain 'url')
    @param url_width_re  regex matching the width portion of the thumbnail URL
    """
    _keys = ('width', 'height')
    # (width, height) of the widest format; missing values count as 0
    max_dimensions = max(
        (tuple(format.get(k) or 0 for k in _keys) for format in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]
5366
5367
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    # returns (start, end, document_size); missing parts are None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5376
5377
def read_stdin(what):
    # Tell the user how to terminate input, then hand back the stdin stream
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5382
5383
class Config:
    """A parsed set of command-line arguments, possibly loaded from (and
    recursively including, via --config-locations) configuration files."""
    own_args = None      # raw args this config was initialized with
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Parse *args* and load any referenced config files.
        Returns False if *filename* was already loaded (breaks inclusion cycles)."""
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.own_args, self.__initialized = args, True
        opts, _ = self.parser.parse_known_args(args)
        self.parsed_args, self.filename = args, filename

        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # relative locations are resolved against the including file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split it into an argument list.
        Returns *default* if the file cannot be opened."""
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # BUGFIX: the message hard-coded "(unknown)" instead of naming
            # the file that failed to parse
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with values of credential options redacted."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        # nested configs share the loaded-paths set to detect cycles
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # nested (later-loaded) configs first, own args last => own args win
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
5480
5481
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    pool = None  # the connected websocket protocol, set by __enter__

    def __init__(self, url, headers=None, connect=True):
        # a private event loop drives all websocket coroutines synchronously
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # make sure the connection is closed even if __exit__ is never called
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # synchronous wrapper around the async send
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # synchronous wrapper around the async recv
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # run a coroutine to completion on `loop` and return its result
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # cancel any tasks still pending on the loop and surface their errors
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5551
5552
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # Title-casing normalizes the key, so later dicts overwrite earlier ones
            merged[name.title()] = value
    return merged
28787f16 5556
5557
def cached_method(f):
    """Decorator memoizing a method's result per instance and argument set"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize positional/keyword/default arguments into one hashable key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())

        # Caches live on the instance, keyed by method name
        try:
            all_caches = self.__cached_method__cache
        except AttributeError:
            all_caches = self.__cached_method__cache = {}
        cache = all_caches.setdefault(f.__name__, {})

        try:
            return cache[cache_key]
        except KeyError:
            cache[cache_key] = f(self, *args, **kwargs)
            return cache[cache_key]
    return wrapper
5575
5576
class classproperty:
    """Descriptor exposing a function as a read-only class-level property"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, owner):
        # Always invoked with the class, whether accessed on it or an instance
        return self.func(owner)
19a03940 5586
5587
class Namespace(types.SimpleNamespace):
    """Simple attribute namespace; iterating yields the attribute values"""

    def __iter__(self):
        yield from vars(self).values()

    # Named `items_` (trailing underscore) to avoid clashing with an
    # attribute called "items"
    @property
    def items_(self):
        return vars(self).items()
9b8ee23b 5597
5598
# Deprecated: kept only for backward compatibility; prefer checking the
# `.dependencies` module directly
has_certifi = bool(certifi)
has_websockets = bool(websockets)