]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
Do not load system certificates when `certifi` is used
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
15dfb392 1import atexit
1e399778 2import base64
5bc880b9 3import binascii
912b38b4 4import calendar
676eb3f2 5import codecs
c380cc28 6import collections
62e609ab 7import contextlib
e3946f98 8import ctypes
c496ca96 9import datetime
0c265486 10import email.header
f8271158 11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
49fa4d9a
N
14import hashlib
15import hmac
ac668111 16import html.entities
17import html.parser
54007a45 18import http.client
19import http.cookiejar
019a94f7 20import importlib.util
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
14f25df2 43import urllib.error
f8271158 44import urllib.parse
ac668111 45import urllib.request
bcf89ce6 46import xml.etree.ElementTree
d77c3dfd 47import zlib
d77c3dfd 48
c487cf00 49from .compat import asyncio, functools # isort: split
8c25f81b 50from .compat import (
36e6f62c 51 compat_etree_fromstring,
51098426 52 compat_expanduser,
f8271158 53 compat_HTMLParseError,
efa97bdc 54 compat_os_name,
702ccf2d 55 compat_shlex_quote,
8c25f81b 56)
ac668111 57from .dependencies import brotli, certifi, websockets, xattr
f8271158 58from .socks import ProxyType, sockssocket
71aff188 59
4644ac55 60
51fb4995
YCH
def register_socks_protocols():
    """Ensure urllib treats the SOCKS schemes as netloc-carrying protocols.

    URLs whose scheme is missing from urllib.parse.uses_netloc are not split
    correctly (see https://bugs.python.org/issue7904), so each SOCKS variant
    is appended exactly once.
    """
    known = urllib.parse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known:
            known.append(scheme)
51fb4995
YCH
68
69
468e2e92
FV
# This is not clearly defined otherwise (re.Pattern on modern Python)
compiled_regex_type = type(re.compile(''))
72
f7a147e3
S
73
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent string whose Chrome version is
    picked at random from a fixed pool of real release numbers."""
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    version = random.choice(_CHROME_VERSIONS)
    return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/%s Safari/537.36') % version
117
118
# Content-Encoding values that can be decoded transparently; 'br' is only
# advertised when the optional brotli dependency is importable
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP headers sent with every request; the User-Agent is
# randomized once per run via random_user_agent()
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
f427df17 131
5f6a1245 132
fb37eb25
S
# Alternative User-Agent strings selectable by name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel distinguishing "argument not supplied" from an explicit None
NO_DEFAULT = object()
IDENTITY = lambda x: x  # no-op transform used as a default callback

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
144
f6717dec
S
# Month names per language code, used when parsing localized dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions recognized as such without further probing
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# (maps each accented character to its ASCII transliteration)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 171
46f59e89
S
# strptime() formats tried in order when parsing free-form dates
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variants preferred when the day is known to come before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Variants preferred when the month is known to come before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of P.A.C.K.E.R.-obfuscated JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches an application/ld+json <script> block; JSON payload in 'json_ld'
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

# Integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
240
7105440c 241
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
d77c3dfd 256
f4bfd65f 257
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a temp file in the destination directory so the final
    # os.rename() is a same-filesystem (and thus atomic on POSIX) move
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile is created 0o600; widen to the default
            # permissions a normally-created file would get under the umask
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort removal of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
282
283
def find_xpath_attr(node, xpath, key, val=None):
    """Locate the first element matching xpath[@key] (or xpath[@key='val'])."""
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 289
d7e66d39
JMF
290# On python2.6 the xml.etree.ElementTree.Element methods don't support
291# the namespace parameter
5f6a1245
JW
292
293
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand each ``ns:tag`` step of *path* into ``{uri}tag`` form using *ns_map*."""
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(step) for step in path.split('/'))
304
d77c3dfd 305
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath under node.

    @param xpath    A single xpath string, or an iterable of candidates tried in order
    @param name     Human-readable name for the error message
    @param fatal    Raise ExtractorError when nothing matches and no default is given
    @param default  Value to return when nothing matches
    """
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        # Try each candidate until one matches
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
327
328
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's .text instead.

    A match whose .text is None is treated like a non-match (default/fatal apply).
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
a41fb80c
S
342
343
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute *key* on the first element matching
    xpath[@key]; default/fatal behave as in xpath_element()."""
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
bf0ff932
PH
355
356
def get_element_by_id(id, html, **kwargs):
    """Return the text content of the element with the given ID, or None."""
    return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 360
12ea2f30 361
def get_element_html_by_id(id, html, **kwargs):
    """Return the full HTML of the element with the given ID, or None."""
    return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
365
366
def get_element_by_class(class_name, html):
    """Return the text content of the first element carrying *class_name*, or None."""
    return next(iter(get_elements_by_class(class_name, html)), None)
371
372
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the full HTML of the first element carrying *class_name*, or None."""
    return next(iter(get_elements_html_by_class(class_name, html)), None)
377
378
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the text content of the first element with attribute=value, or None."""
    return next(iter(get_elements_by_attribute(attribute, value, html, **kwargs)), None)
382
383
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the full HTML of the first element with attribute=value, or None."""
    return next(iter(get_elements_html_by_attribute(attribute, value, html, **kargs)), None)
387
388
def get_elements_by_class(class_name, html, **kargs):
    """Return the text content of every element carrying *class_name*, as a list."""
    # Match class_name as one whitespace-delimited token of the class attribute
    class_value = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value, html, escape_value=False)
394
395
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the full HTML of every element carrying *class_name*, as a list."""
    # Match class_name as one whitespace-delimited token of the class attribute
    class_value = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value, html, escape_value=False)
401
402
def get_elements_by_attribute(*args, **kwargs):
    """Return the text content of every matching element as a list."""
    return [text for text, _whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
406
407
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the full HTML of every matching element as a list."""
    return [whole_element for _text, whole_element in get_elements_text_and_html_by_attribute(*args, **kwargs)]
411
412
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    # A value starting with whitespace/quote/backtick/=/<> cannot occur
    # unquoted, so quotes are then mandatory; otherwise they are optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an element opening up to (and including) the target attribute;
    # the attribute value itself is matched verbatim ((?-x:) turns verbose off)
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
        (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
        \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip enclosing quotes (if any) before decoding entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 436
c5229f39 437
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        # Stack of currently-open tag names, outermost first
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag is found, tolerating unclosed
        # inner tags along the way
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The outermost tag just closed: signal the caller to stop feeding
            raise self.HTMLBreakOnClosingTagException()
478
479
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises the supplied parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Prime the parser with just the opening tag
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed up to each candidate closing tag until the parser signals that
        # the OUTERMOST tag closed — this correctly skips nested same-name tags
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
513
514
class HTMLAttributeParser(html.parser.HTMLParser):
    """Minimal parser recording the attributes of the last start tag fed to it."""

    def __init__(self):
        self.attrs = {}  # attribute name -> value of the most recent start tag
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
524
c5229f39 525
class HTMLListAttrsParser(html.parser.HTMLParser):
    """Collect the attributes of every top-level <li> element fed to the parser."""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []   # one attribute dict per top-level <li>
        self._level = 0   # tag nesting depth relative to the start of the feed

    def handle_starttag(self, tag, attrs):
        # Only record <li> elements that are not nested inside another tag
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
541
542
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
      a="foo" B="bar" c="&98;az" d=boz
      empty= noval entity="&amp;"
      sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    attr_parser = HTMLAttributeParser()
    # A malformed element yields whatever was parsed so far instead of raising
    with contextlib.suppress(compat_HTMLParseError):
        attr_parser.feed(html_element)
        attr_parser.close()
    return attr_parser.attrs
9e6dd238 562
c5229f39 563
73673ccf
FF
def parse_list(webpage):
    """Parse a string of consecutive HTML <li> elements and return a list
    with one attribute dictionary per top-level item."""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
571
572
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    text = re.sub(r'\s+', ' ', html)                                # collapse whitespace runs
    text = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', text)         # <br> -> newline
    text = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', text)   # paragraph break -> newline
    text = re.sub('<.*?>', '', text)                                # strip remaining tags
    return unescapeHTML(text).strip()                               # decode entities, trim
9e6dd238
FV
587
588
class LenientJSONDecoder(json.JSONDecoder):
    """json.JSONDecoder that can pre-process its input via *transform_source*
    and, with *ignore_extra*, tolerate trailing data after the first JSON value."""

    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source = transform_source
        self.ignore_extra = ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if not self.ignore_extra:
            return super().decode(s)
        # raw_decode stops at the end of the first JSON document
        obj, _end = self.raw_decode(s.lstrip())
        return obj
600
601
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            # Switch stdout to binary mode so written bytes are not mangled
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # Two attempts: the filename as given, then its sanitize_path()'d form
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking unsupported or failed: fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # A permissions error will not be fixed by renaming; give up now
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, so retrying would be pointless
                raise
d77c3dfd
FV
636
637
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp (or None)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 645
5f6a1245 646
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect

    Replacement characters are temporarily prefixed with NUL ('\0') so that
    runs of substitutes can be collapsed/stripped before the markers are removed.
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Control characters and '?' are dropped entirely
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
        result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 694
5f6a1245 695
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless force=True.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters forbidden in Windows path components (and trailing
    # spaces/dots), while keeping '.'/'..' traversal components intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep the original POSIX path absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
717
718
def sanitize_url(url):
    """Best-effort URL cleanup: give protocol-relative URLs an http: scheme
    and repair a few scheme typos commonly seen in the wild."""
    if url is None:
        return
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return 'http:%s' % url
    # Fix some common typos seen so far
    for typo, correction in (
            # https://github.com/ytdl-org/youtube-dl/issues/15649
            (r'^httpss://', r'https://'),
            # https://bx1.be/lives/direct-tv/
            (r'^rmtp([es]?)://', r'rtmp\1://')):
        if re.match(typo, url):
            return re.sub(typo, correction, url)
    return url
17bcc626
S
737
738
def extract_basic_auth(url):
    """Split userinfo credentials out of *url*.

    Returns (stripped_url, auth_header) where auth_header is a ready-made
    HTTP Basic ``Authorization`` value, or None when the URL has no username.
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname
    if parts.port is not None:
        netloc = '%s:%d' % (netloc, parts.port)
    stripped_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode()).decode()
    return stripped_url, f'Basic {token}'
5435dcf9
HH
749
750
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from a sanitized/escaped URL, moving any
    userinfo credentials into a Basic Authorization header."""
    clean_url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(clean_url, *args, **kwargs)
67dda517
S
757
758
51098426
S
def expand_path(s):
    """Expand environment variables and the user home marker (~) in *s*."""
    return os.path.expandvars(compat_expanduser(s))
762
763
def orderedSet(iterable, *, lazy=False):
    """Deduplicate *iterable* preserving first-seen order.

    Returns a generator when lazy=True, otherwise a list.
    """
    def _dedup():
        seen = []  # a list, not a set: items may be unhashable
        for item in iterable:
            if item in seen:
                continue
            seen.append(item)
            yield item

    gen = _dedup()
    return gen if lazy else list(gen)
d77c3dfd 774
912b38b4 775
55b2f099 776def _htmlentity_transform(entity_with_semicolon):
4e408e47 777 """Transforms an HTML entity to a character."""
55b2f099
YCH
778 entity = entity_with_semicolon[:-1]
779
4e408e47 780 # Known non-numeric HTML entity
ac668111 781 if entity in html.entities.name2codepoint:
782 return chr(html.entities.name2codepoint[entity])
4e408e47 783
55b2f099
YCH
784 # TODO: HTML5 allows entities without a semicolon. For example,
785 # '&Eacuteric' should be decoded as 'Éric'.
ac668111 786 if entity_with_semicolon in html.entities.html5:
787 return html.entities.html5[entity_with_semicolon]
55b2f099 788
91757b0f 789 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
790 if mobj is not None:
791 numstr = mobj.group(1)
28e614de 792 if numstr.startswith('x'):
4e408e47 793 base = 16
28e614de 794 numstr = '0%s' % numstr
4e408e47
PH
795 else:
796 base = 10
067aa17e 797 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 798 with contextlib.suppress(ValueError):
ac668111 799 return chr(int(numstr, base))
4e408e47
PH
800
801 # Unknown entity in name, return its literal representation
7a3f0c00 802 return '&%s;' % entity
4e408e47
PH
803
804
def unescapeHTML(s):
    """Decode HTML entities in *s* (None passes through unchanged)."""
    if s is None:
        return None
    assert isinstance(s, str)

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
d77c3dfd 812
8bf48f23 813
def escapeHTML(text):
    """Escape the five HTML-special characters in *text*."""
    # '&' must be replaced first so the other entities aren't double-escaped
    for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                         ('"', '&quot;'), ("'", '&#39;')):
        text = text.replace(char, entity)
    return text
823
824
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated shim forwarding to Popen.communicate_or_kill()."""
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 829
830
class Popen(subprocess.Popen):
    """subprocess.Popen that hides the console window on Windows and adds
    communicate_or_kill()/run() conveniences."""

    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # Like communicate(), but make sure the child does not outlive us
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; with timeout != 0 also wait for it to terminate."""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        """Run to completion and return (stdout or '', stderr or '', returncode)."""
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
862
d3c93ec2 863
aa49acd1
S
def get_subprocess_encoding():
    """Return the text encoding to use when talking to subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    return sys.getfilesystemencoding() or 'utf-8'
874
875
def encodeFilename(s, for_subprocess=False):
    """Identity on Python 3 (filenames stay str); asserts the input type."""
    assert isinstance(s, str)
    return s
aa49acd1
S
879
880
def decodeFilename(b, for_subprocess=False):
    """Identity on Python 3: the argument is returned unchanged."""
    return b
8bf48f23 883
f07b74fc
PH
884
def encodeArgument(s):
    """Return *s* as str, ASCII-decoding legacy byte-string arguments."""
    if isinstance(s, str):
        return s
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s.decode('ascii')
f07b74fc
PH
890
891
def decodeArgument(b):
    """Identity on Python 3: the argument is returned unchanged."""
    return b
aa49acd1
S
894
895
8271226a
PH
def decodeOption(optval):
    """Return a command-line option value as str; bytes are decoded with the
    system's preferred encoding and None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, str)
    return optval
1c256f70 904
5f6a1245 905
# Named time components produced by timetuple_from_msec()
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
914
915
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration given in seconds as H:MM:SS, M:SS or S (leading zero
    units omitted), optionally appending a .mmm millisecond suffix."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if msec:
        return '%s.%03d' % (formatted, t.milliseconds)
    return formatted
4539dd30 925
a0ddb8a2 926
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        # Keep only x509 certs explicitly trusted for server authentication
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # The store may be unreadable in restricted environments; skip it
        return
    for cert in certs:
        # A single bad certificate must not abort loading the remaining ones
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
a2366922 938
77562778 939
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSLContext configured from *params*.

    Honours 'nocheckcertificate', 'legacyserverconnect', certifi (unless the
    'no-certifi' compat opt is set) and optional client certificates.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available; system certificates are
        # deliberately NOT loaded in that case
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 982
732ea2f0 983
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" blurb.

    *before* is the text that will precede the message; its trailing
    whitespace is stripped and, when it ends a sentence (or is empty),
    the blurb is capitalized.
    """
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    # Start with a capital letter when beginning a new sentence
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    if before:
        return f'{before} {msg}'
    return msg
08f2a92c
JMF
995
996
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may pre-set a default message at class level
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # No explicit message: fall back to the class default,
            # then to the class name itself
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
1007
1008
# Exception classes that indicate a (transient) network failure rather than a bug
_network_exception_list = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):  # may be missing on stripped-down builds
    _network_exception_list.append(ssl.CertificateError)
network_exceptions = tuple(_network_exception_list)
1013
1014
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while a network exception is being handled are
        # always treated as "expected" (not a yt-dlp bug)
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie  # name/instance of the extractor that raised
        self.exc_info = sys.exc_info()  # preserve original exception
        # When re-raising an ExtractorError, keep pointing at the innermost one
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        # Message format: "[ie] video_id: msg (caused by ...)" with the
        # bug-report blurb appended for unexpected errors
        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback(s) of this error and its cause, or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1047
1c256f70 1048
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        self.url = url
        # Always "expected": an unsupported site is not a yt-dlp bug
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
1054
1055
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
1059
1060
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo restrictions are never a yt-dlp bug — force expected=True
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1072
1073
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
d77c3dfd
FV
1086
1087
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message used when none is supplied (see YoutubeDLError)
    msg = 'Entry not found in info'
498f5606 1095
1096
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # Bug fix: the suffix previously contained a literal placeholder
        # instead of interpolating the clashing filename. Mirror the
        # pattern used by UnavailableVideoError (f': {err}').
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1109
1110
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
5f6a1245 1117
5f6a1245 1118
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
8b0d7497 1122
8b0d7497 1123
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1127
48f79687 1128
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1132
1133
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1137
1138
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        # Whether the condition is a normal, anticipated situation
        self.expected = expected
        super().__init__(msg)
1145
1146
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Subclassing ReExtractInfo means the video info gets re-extracted
        super().__init__(self.msg, expected=False)
f2ebc5c7 1153
d77c3dfd 1154
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error, if any, to the default message
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1167
1168
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both counts are in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
d77c3dfd 1182
5f6a1245 1183
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails.

    `self.reason` classifies the failure as 'NO_SPACE', 'VALUE_TOO_LONG'
    or 'NOT_SUPPORTED', based on the errno and/or message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Classify errno/message into a coarse machine-readable reason
        out_of_space = (code in (errno.ENOSPC, errno.EDQUOT)
                        or 'No space left' in msg or 'Disk quota exceeded' in msg)
        value_too_long = code == errno.E2BIG or 'Argument list too long' in msg
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif value_too_long:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1198
1199
class XAttrUnavailableError(YoutubeDLError):
    # Raised when no usable xattr implementation is available on this system
    pass
1202
1203
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class*, optionally forcing the configured source address.

    When the handler's params contain 'source_address', the connection's
    address-resolution is patched so only addresses of the matching IP
    family (v4/v6) are tried.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the *source* address format
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the last error; try the next resolved address
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1249
1250
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, any Accept-Encoding header is dropped as
    well (so the server sends an uncompressed response). Returns the
    original mapping untouched when the marker is absent.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del filtered['Youtubedl-no-compression']
    return filtered
87f0e62d
YCH
1259
1260
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated
    and brotli responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open a plain-HTTP connection, routing through a SOCKS proxy if requested."""
        conn_class = http.client.HTTPConnection

        # The SOCKS proxy is smuggled in via an internal header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without the zlib header."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data (requires the optional brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Add the configured (or standard) headers unless already present
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        # Strip internal marker headers (e.g. Youtubedl-no-compression)
        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Transparently decompress the response body and re-escape Location headers."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more trailing bytes stripped
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1389
5de90176 1390
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* that tunnels through *socks_proxy*.

    *socks_proxy* is a URL like 'socks5://user:pass@host:port'; the port
    defaults to 1080 and DNS resolution is delegated to the proxy.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    # NOTE(review): an unrecognized scheme leaves socks_type unbound and
    # would raise NameError below — callers presumably validate the scheme
    # first; confirm before relying on this with arbitrary input
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Percent-decode credentials, leaving None/'' untouched
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the proxied socket in TLS
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1432
1433
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class, SOCKS proxies
    and a friendlier error for legacy-TLS handshake failures."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the SSL context / hostname checking set up by make_HTTPS_handler
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # The SOCKS proxy is smuggled in via an internal header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            # Point users at --legacy-server-connect for old/broken servers
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
be4a824d
PH
1462
1463
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a Netscape cookie line
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        # Accept path-like objects in addition to str/bytes
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        # Netscape format spells booleans as TRUE/FALSE
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Yield a text file object for *file*, which may be a path or an open file.

        Note: deliberately shadows the builtin `open` within this class.
        """
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write all (non-discarded, non-expired) cookies to *f* in Netscape format."""
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Validate/normalize one cookie line before feeding the base parser
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    # A leading [, { or " strongly suggests a JSON export
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1596
1597
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that applies the same cookie handling to HTTPS
    requests and responses as to plain HTTP ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # Mirror the HTTP hooks for HTTPS traffic
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1607
1608
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # Body-describing headers must not be forwarded to the new request
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
fca6dba8
S
1669
1670
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (offset, remaining_string) where offset is a datetime.timedelta;
    it is zero when no explicit +hh[:]mm/-hh[:]mm offset is found (a plain
    trailing 'Z' is stripped but also yields a zero offset).
    """
    m = re.search(
        r'''(?x)
            ^.{8,}? # >=8 char non-TZ prefix, if present
            (?P<tz>Z| # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
                (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]? # optional space
                (?P<sign>\+|-) # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Trailing 'Z' (UTC): strip it, offset stays zero
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return offset, date_str
1695
1696
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Drop fractional seconds; %S does not accept them in this format
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        # Parse (and strip) the offset from the string itself
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        # Unparseable date string
        return None
    return calendar.timegm(parsed.timetuple())
912b38b4
PH
1712
1713
46f59e89
S
def date_formats(day_first=True):
    """Return the candidate strptime formats, ordered for day-first or month-first locales."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1716
1717
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None
    upload_date = None
    date_str = date_str.replace(',', ' ')  # commas act as separators
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # NB: deliberately no break -- the last format that parses wins
    for fmt in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
bf50b038 1740
5f6a1245 1741
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string, or None."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # A PM marker shifts the hour by 12; note it before stripping
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # First matching format wins
    for fmt in date_formats(day_first):
        with contextlib.suppress(ValueError):
            parsed = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(parsed.timetuple())

    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1771
1772
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
73e79f2a 1784
5f6a1245 1785
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename: <base>.<sub_lang>.<sub_format>."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
d4051a8e 1788
5f6a1245 1789
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE with no offset suffix
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Resolve the base recursively, then apply the signed offset
    base = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
    unit = match.group('unit')
    if unit in ('month', 'year'):
        # timedelta cannot express months/years; handle them separately
        result = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        result = base + datetime.timedelta(**{unit + 's': amount})
    if auto_precision:
        return datetime_round(result, unit)
    return result
1830
1831
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1842
1843
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months, clamping the day to the target month's length."""
    total_months = dt.year * 12 + (dt.month - 1) + months
    year, month_index = divmod(total_months, 12)
    month = month_index + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1851
1852
def datetime_round(dt, precision='day'):
    """Round a datetime object's time to a specific precision (microsecond|second|minute|hour|day)."""
    if precision == 'microsecond':
        return dt

    seconds_per = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    # Round half-up to the nearest multiple of the unit
    rounded = ((timestamp + seconds_per / 2) // seconds_per) * seconds_per
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1869
1870
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format; other inputs pass through unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1879
5f6a1245 1880
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96
PH
1910
1911
def platform_name():
    """ Returns the platform name as a str """
    # Deprecated shim kept for backward compatibility; use platform.platform() directly
    write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
    return platform.platform()
c496ca96 1916
b1f94422 1917
@functools.cache
def system_identifier():
    """Return a one-line description of the running Python and OS (cached after first call)."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    return 'Python %s (%s %s) - %s %s' % (
        platform.python_version(),
        impl,
        platform.architecture()[0],
        platform.platform(),
        format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
    )
c257baff
PH
1931
1932
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1940
1941
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (stderr by default).

    Handles binary-mode streams and text streams with an underlying
    buffer by encoding explicitly; on Windows terminals, newlines get a
    leading space inserted (works around console rendering issues).
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, enc = out, None
    if 'b' in getattr(out, 'mode', ''):
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    target.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1958
1959
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the list of integer byte values in *bs* ([] for empty/None input)."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # bytes/bytearray yield ints directly
        return list(bs)
    # str input: fall back to per-character code points
    return [ord(c) for c in bs]
1967
c257baff 1968
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values (0-255) into a bytes object."""
    return struct.pack('%dB' % len(xs), *xs) if xs else b''
c38b1e77
PH
1973
1974
class LockingUnsupportedError(OSError):
    # Raised by the fallback _lock_file/_unlock_file implementations below
    # when neither msvcrt- nor fcntl-based locking is available.
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1980
1981
c1c9a79c
PH
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # low/high DWORD halves of the byte-range length used to lock the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the whole file via LockFileEx; raises BlockingIOError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlocking
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 == LOCKFILE_EXCLUSIVE_LOCK, 0x1 == LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock previously taken by _lock_file."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """flock()-based whole-file lock with lockf() fallback."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock taken by _lock_file (flock first, then lockf)."""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            # No locking primitive available on this platform
            raise LockingUnsupportedError()

        def _unlock_file(f):
            # No locking primitive available on this platform
            raise LockingUnsupportedError()
c1c9a79c
PH
2066
2067
class locked_file:
    """A file wrapper that takes an advisory lock (via _lock_file) while open.

    Usable as a context manager; exclusive lock for any writable mode,
    shared lock for read-only modes. Truncation for 'w' modes is deferred
    until after the lock is acquired.
    """

    locked = False  # True between successful __enter__/unlock

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Read-only modes only need a shared lock
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only now that we hold the lock (see O_CREAT note above)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        # Idempotent: safe to call when not locked
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, seek, ...) to the file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2131
4eb7f1d1 2132
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when undetermined."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
2137
2138
def shell_quote(args):
    """Return a shell-safe command line built by quoting each argument."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
9d4660ca
PH
2148
2149
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with anything already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    return url + '#' + urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
9d4660ca
PH
2158
2159
def unsmuggle_url(smug_url, default=None):
    """Split a smuggle_url() result back into (url, data); data is *default* when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, smuggled = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(smuggled)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
02dbf93f
PH
2167
2168
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    exponent = min(int(math.log(num, factor)), len(suffixes)) if num else 0
    suffix = ('', *suffixes)[exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ... ('' stays empty)
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
e0fd9573 2181
2182
def format_bytes(bytes):
    """Format a byte count as a human-readable string with binary (1024-based) prefixes, or 'N/A'."""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2185
1c088fa8 2186
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of *s* using *unit_table* as the multiplier map; None when no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Treat ',' as a decimal separator, same as '.'
    value = float(m.group('num').replace(',', '.'))
    return int(value * unit_table[m.group('unit')])
2196
2197
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size like '5.6 MiB' or '2 gigabytes' into a byte count (int), or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE: the all-lowercase two-letter forms ('kB', 'mB', ...) map to the
    # binary (1024-based) multipliers, while mixed forms like 'Kb' are decimal.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2267
2268
def parse_count(s):
    """Parse an abbreviated count such as '1.2M views' into an int, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'views 1,234' style prefixes)
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    parsed = lookup_unit_table(_UNIT_TABLE, s)
    if parsed is not None:
        return parsed

    # Fall back to a bare number followed by arbitrary text
    m = re.match(r'([\d,.]+)(?:$|\s)', s)
    if m:
        return str_to_int(m.group(1))
be64b5b0 2296
2f7ae819 2297
def parse_resolution(s, *, lenient=False):
    """Extract {'width': ..., 'height': ...} from strings like '1920x1080', '720p' or '4k'.

    With lenient=True, a WxH pair embedded in other alphanumerics also matches.
    Returns {} when nothing is recognized.
    """
    if s is None:
        return {}

    if lenient:
        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
2321
2322
def parse_bitrate(s):
    """Return the integer kbps value from a string like '1500 kbps' (None otherwise)."""
    if not isinstance(s, str):
        return None
    match = re.search(r'\b(\d+)\s*kbps', s)
    return int(match.group(1)) if match else None
2329
2330
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return names.index(name) + 1
    except ValueError:
        return None
2340
2341
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
18258362
JMF
2350
2351
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-escaped entities and character references untouched
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
e3946f98
PH
2358
2359
def setproctitle(title):
    """Best-effort: set the process title via libc prctl(); silently a no-op when unavailable."""
    assert isinstance(title, str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    encoded = title.encode()
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2384
2385
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present (None-safe)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2388
2389
def remove_end(s, end):
    """Strip *end* from the end of *s* if present (None-safe).

    Fix: an empty *end* previously truncated the whole string, because
    ``s.endswith('')`` is True and ``s[:-len('')]`` == ``s[:0]`` == ``''``.
    An empty suffix is now a no-op.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
2b9faf55
PH
2392
2393
31b2051e
S
2394def remove_quotes(s):
2395 if s is None or len(s) < 2:
2396 return s
2397 for quote in ('"', "'", ):
2398 if s[0] == quote and s[-1] == quote:
2399 return s[1:-1]
2400 return s
2401
2402
def get_domain(url):
    """Return the last two labels of the URL's host (e.g. 'example.com')."""
    netloc = urllib.parse.urlparse(url).netloc
    return '.'.join(netloc.rsplit('.', 2)[-2:])
b6e0c7d2
U
2405
2406
def url_basename(url):
    """Return the final component of the URL's path ('' when there is none)."""
    return urllib.parse.urlparse(url).path.strip('/').split('/')[-1]
aa94a6d3
PH
2410
2411
02dc0a36
S
def base_url(url):
    # Return the URL up to (and including) the last '/' before any '?', '#' or '&'.
    # NOTE: raises AttributeError when *url* has no 'http(s)://host/...' prefix
    # (re.match returns None).
    return re.match(r'https?://[^?#&]+/', url).group()
2414
2415
def urljoin(base, path):
    """Join *base* and *path* like urllib, tolerating bytes input; returns None for unusable inputs."""
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        # Already absolute (scheme-relative counts as absolute)
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
e34c3361
S
2429
2430
class HEADRequest(urllib.request.Request):
    """A urllib Request that always uses the HTTP HEAD method."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2434
2435
class PUTRequest(urllib.request.Request):
    """A urllib Request that always uses the HTTP PUT method."""

    def get_method(self):
        return 'PUT'
2439
2440
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Best-effort int conversion: int(v) * invscale // scale, or *default* on failure.

    When *get_attr* is given, the attribute of that name is read off *v* first.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return result
9732d77e 2448
9572013d 2449
def str_or_none(v, default=None):
    """Coerce *v* to str, returning *default* when it is None."""
    if v is None:
        return default
    return str(v)
40a90862 2452
9732d77e
PH
2453
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Drop thousands separators and stray '.'/'+' before converting
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
    # Any other type (float, None, ...) falls through to None
608d11f5
PH
2461
2462
def float_or_none(v, scale=1, invscale=1, default=None):
    """Best-effort float conversion: float(v) * invscale / scale, or *default* on failure/None."""
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
    return result
43f775e4
PH
2470
2471
c7e327c4
S
def bool_or_none(v, default=None):
    """Return *v* when it is a real bool, else *default* (truthiness is not coerced)."""
    if isinstance(v, bool):
        return v
    return default
2474
2475
def strip_or_none(v, default=None):
    """Return v.strip() for strings; *default* for any other type."""
    if isinstance(v, str):
        return v.strip()
    return default
b72b4431
S
2478
2479
def url_or_none(url):
    """Return the stripped URL when it uses a recognized scheme (http, rtmp, mms, ftp, ...), else None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
af03000a
S
2485
2486
def request_to_url(req):
    """Return the URL string for either a urllib Request object or a plain URL."""
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    return req
2492
2493
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp (int/float) or 'YYYYMMDD' string with *date_format*; *default* on failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            parsed = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            parsed = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            return default
        return parsed.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2504
2505
def parse_duration(s):
    """Parse a duration string into seconds (int/float), or None.

    Accepts clock notation ('1:23:45', '12:34.5'), ISO 8601-like
    ('PT1H2M3S') / verbose ('3 hours', '2min 5.42s') notations.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1) clock format: [[[DD:]HH:]MM:]SS[.ms], optionally ending in 'Z'
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO 8601-like / verbose units; years/months/weeks are matched
        #    but not captured, so they do not contribute to the result
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) lone '1.5 hours' / '90 min' style
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        # clock format may separate fractional seconds with ':' instead of '.'
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2560
2561
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension of *filename*.

    When *expected_real_ext* is given and the actual extension differs,
    *ext* is appended to the full filename instead (mirroring
    replace_extension's fallback).

    Fix: the mismatch branch previously returned ``f'(unknown).{ext}'``,
    discarding the original filename entirely.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')
d70ad093
PH
2568
2569
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*.

    The old extension is only stripped when it matches
    *expected_real_ext* (or when no expectation is given); otherwise
    *ext* is appended to the full filename.
    """
    name, real_ext = os.path.splitext(filename)
    base = name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename
    return f'{base}.{ext}'
2575
2576
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
b7ab0590
PH
2585
2586
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run *exe* with *args* and return its combined stdout/stderr text, or False when it cannot be launched."""
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return stdout
cae97f65
PH
2599
2600
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's --version output; *unrecognized* when none is found."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2610
2611
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    output = _get_exe_version_output(exe, args)
    return detect_exe_version(output, version_re, unrecognized) if output else False
2618
2619
def frange(start=0, stop=None, step=1):
    """Like range(), but supporting float boundaries and step (generator)."""
    if stop is None:
        start, stop = 0, start
    direction = (step > 0) - (step < 0)  # -1, 0 or 1; 0 yields nothing
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2628
2629
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Subclass so callers can distinguish "LazyList ran out of items"
        # from unrelated IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared with a sibling list (see __reversed__/__copy__)
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map a forward index (or slice bound) to the equivalent
        # negative index counted from the end
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only consume as many items as needed to satisfy the request
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The cache is shared, so items consumed by either view benefit both
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2717
483336e7 2718
class PagedList:
    """Base class for lists whose entries are fetched one page at a time"""

    class IndexError(IndexError):
        # Raised when an index lies beyond the available entries
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc: callable(pagenum) returning an iterable of that page's results
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return (and cache, when enabled) the list of results for `pagenum`"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a plain list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Indexing re-slices from scratch, so the cache is required to
        # avoid refetching the same page on every access
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
55575225 2757
9c44d242
PH
2758
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted entry within this page
            # (None means the whole remainder of the page is wanted)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record that pages from this one onwards are unavailable
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2798
2799
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Clamp the last page to the known page count
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        # Total number of entries still wanted (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                # Only the first page needs its leading entries dropped
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2824
2825
class PlaylistEntries:
    """Wrapper over a playlist's entries supporting 1-based indexing and
    --playlist-items style slices"""

    # Sentinel for a requested entry that is absent from the playlist
    MissingEntry = object()
    # Whether the full extent of the playlist is already known
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # Scatter the known entries into their 1-based positions,
            # leaving MissingEntry in the gaps
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Wrap a bare iterable so entries can be indexed lazily
            self._entries = LazyList(entries)

    # Grammar of one --playlist-items segment: [start][(:|-)[end][:step]]
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice for each comma-separated segment of `string`"""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # "inf" parses as float, hence float_or_none for the end bound
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (1-based index, entry) pairs selected by the user's params"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if cheaply knowable, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function mapping a 0-based index to an entry,
        # raising self.IndexError when past the end
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Route extraction errors through YoutubeDL's handler
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based index, entry) pairs; `idx` itself is 1-based
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2958
2959
def uppercase_escape(s):
    """Replace literal \\UXXXXXXXX escape sequences in `s` with the characters they denote."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
2966
2967
def lowercase_escape(s):
    """Replace literal \\uXXXX escape sequences in `s` with the characters they denote."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 2974
d05cfe06
S
2975
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Reserved characters (and '%', to keep pre-encoded input intact) survive unescaped
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
d05cfe06
S
2979
2980
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = urllib.parse.urlparse(url)
    # IDNA-encode the host; percent-escape every other component
    replacements = {
        'netloc': parsed.netloc.encode('idna').decode('ascii'),
        **{part: escape_rfc3986(getattr(parsed, part))
           for part in ('path', 'params', 'query', 'fragment')},
    }
    return parsed._replace(**replacements).geturl()
2991
62e609ab 2992
def parse_qs(url):
    """Return the query string of `url` parsed into a dict of value-lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
4dfbf869 2995
2996
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read a batch file (an iterable of str/bytes lines) and return its URLs.

    Leading BOMs and whitespace are stripped; empty lines and lines starting
    with '#', ';' or ']' are treated as comments and dropped. An inline
    comment introduced by whitespace + '#' is removed from the end of a URL.
    The file object is closed when done.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword (positional use is
        # deprecated since Python 3.13)
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3014
3015
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return ASCII bytes suitable as a POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
3018
3019
def update_url_query(url, query):
    """Return `url` with the parameters from `query` merged into its query string."""
    if not query:
        return url
    parsed = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(parsed.query)
    params.update(query)
    new_query = urllib.parse.urlencode(params, True)
    return urllib.parse.urlunparse(parsed._replace(query=new_query))
16392824 3028
8e60dc75 3029
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone a urllib Request, optionally overriding its URL, body, headers
    and query parameters, while preserving the method and timeout."""
    new_headers = req.headers.copy()
    new_headers.update(headers or {})
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    # HEAD/PUT need the project's specialized Request subclasses
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        new_url, data=new_data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3048
3049
def _multipart_encode_impl(data, boundary):
    """Serialize `data` into a multipart/form-data body using `boundary`.

    Returns (body_bytes, content_type). Raises ValueError if the boundary
    occurs inside any encoded field (caller retries with a new boundary).
    """
    boundary_bytes = boundary.encode('ascii')
    parts = []
    for name, value in data.items():
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in part:
            raise ValueError('Boundary overlaps with data')
        parts.append(b'--' + boundary_bytes + b'\r\n' + part)

    body = b''.join(parts) + b'--' + boundary_bytes + b'--\r\n'
    return body, 'multipart/form-data; boundary=%s' % boundary
3070
3071
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    user_supplied_boundary = boundary is not None
    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # The boundary collided with the payload: a caller-supplied
            # boundary is a hard error, a random one is simply regenerated
            if user_supplied_boundary:
                raise
            boundary = None
3100
3101
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value among d[k] for k in key_or_keys.

    None values are always skipped; falsy values are skipped as well unless
    `skip_false_values` is False. Falls back to `default`.
    """
    for key in variadic(key_or_keys):
        value = d.get(key)
        if value is None:
            continue
        if value or not skip_false_values:
            return value
    return default
cbecc9b9
S
3107
3108
def try_call(*funcs, expected_type=None, args=(), kwargs=None):
    """Call each of `funcs` with the given arguments and return the first
    result that does not raise a common lookup/arithmetic error and, when
    `expected_type` is given, is an instance of it. Returns None otherwise.

    Only AttributeError, KeyError, TypeError, IndexError and
    ZeroDivisionError are swallowed; other exceptions propagate.

    NB: defaults are immutable/None instead of the previous `[]`/`{}`
    to avoid the shared-mutable-default-argument pitfall.
    """
    for func in funcs:
        try:
            result = func(*args, **(kwargs or {}))
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(result, expected_type):
                return result
3118
3119
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to `src` and return the first
    successfully extracted value (optionally filtered by `expected_type`)."""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
329ca3be
S
3122
3123
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of `dct` keeping only items for which cndn(key, value)
    is true. By default, items whose value is None are dropped."""
    result = {}
    for key, value in dct.items():
        if cndn(key, value):
            result[key] = value
    return result
3126
3127
6cc62232
S
def merge_dicts(*dicts):
    """Merge `dicts` left to right: the first non-None value for a key wins,
    except that an already-merged empty string may be replaced by a later str."""
    result = {}
    for current in dicts:
        for key, value in current.items():
            if value is None:
                continue
            if key not in result or (isinstance(value, str) and result[key] == ''):
                result[key] = value
    return result
3136
3137
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as str, decoding bytes-like input with `encoding`."""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
8e60dc75 3140
16392824 3141
a1a530b0
PH
# US (MPAA) movie ratings mapped to the minimum viewer age used for --age-limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
3149
3150
# US TV Parental Guidelines ratings mapped to the minimum viewer age
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3159
3160
def parse_age_limit(s):
    """Parse an age limit from an int, an 'NN+' string, a US movie rating
    or a TV parental guideline; returns the minimum age as an int, or None."""
    # isinstance(False, int) is True, so booleans must be excluded via type()
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    upper = s.upper()
    rating = US_RATINGS.get(upper)
    if rating is not None:
        return rating
    # Accept TV ratings with '-', '_' or no separator (e.g. TV-14, TV_14, TV14)
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    mobj = re.match(rf'^TV[_-]?({tv_suffixes})$', upper)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
146c80e2
S
3177
3178
def strip_jsonp(code):
    """Remove a JSONP callback wrapper and return the raw JSON payload."""
    JSONP_RE = r'''(?sx)^
        (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
        (?:\s*&&\s*(?P=func_name))?
        \s*\(\s*(?P<callback_data>.*)\);?
        \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
478c2c61
PH
3187
3188
def js_to_json(code, vars={}):
    """Convert a JavaScript object/value literal into parseable JSON text."""
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Hex and octal integer literals (optionally used as object keys)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        # Rewrite one matched token (string, identifier, number, comment, ...)
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Normalize JS string escapes into their JSON equivalents
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Integer object keys must become JSON strings
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]

        # Bare identifiers become quoted strings
        return '"%s"' % v

    def create_map(mobj):
        # Turn `new Map([...])` into a plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3241
3242
478c2c61
PH
def qualities(quality_ids):
    """Return a callable mapping a quality id to its rank (position) in
    `quality_ids`; unknown ids rank as -1."""
    def rank(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return rank
3251
acd69589 3252
# Stages at which a postprocessor may be invoked
# NOTE(review): ordering appears to reflect execution order - confirm against callers
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types
# NOTE(review): the values look like default filename stems used when writing
# the corresponding file type (None = no default) - confirm against usage
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
0a871f68 3273
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template for a regex matching one %-style format specifier;
# {0} is the pattern for the mapping key, {1} the pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
'''


# Conversion type characters accepted by the template above
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3292
7d1eb38a 3293
a020a0dc
PH
def limit_length(s, length):
    """Truncate `s` to at most `length` characters, ending with '...' when
    shortened. None passes through unchanged."""
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
48844745
PH
3302
3303
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
3306
3307
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`; missing or
    unparseable versions yield `not assume_new`."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3315
3316
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported locally, presumably to avoid a circular import - confirm
    from .update import is_non_updateable

    return not is_non_updateable()
7d4111ed
PH
3323
3324
def args_to_str(args):
    """Get a short, shell-quoted string representation of a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
2ccd1b10
PH
3328
3329
def error_to_compat_str(err):
    """Return the message of an exception as a str."""
    return f'{err}'
fdae2358
S
3332
3333
def error_to_str(err):
    """Format an exception as 'ExceptionType: message'."""
    return '%s: %s' % (type(err).__name__, err)
3336
3337
def mimetype2ext(mt):
    """Convert a MIME type (e.g. 'audio/mp4') to a file extension.

    Parameters after ';' are ignored. Lookup order: the full type, then the
    (lowercased) subtype, then the '+suffix' of the subtype; as a last resort
    the subtype itself is returned with '+' replaced by '.'.
    Returns None for None input.
    """
    if mt is None:
        return None

    # Strip parameters such as '; charset=utf-8' (previously captured into an
    # unused local)
    mt, _, _ = mt.partition(';')
    mt = mt.strip()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype.lower())
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    # Fallback: use the subtype itself, turning '+' into '.'
    return subtype.replace('+', '.')
c460bdd5
PH
3400
3401
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare file extension or a URL/filename."""
    if not ext_or_url:
        return None
    # A bare extension needs a dummy filename for mimetypes to recognize it
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(name)
    return mime
3408
3409
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into vcodec/acodec/scodec/dynamic_range."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Strip zeros so e.g. 'mp40a' variants normalize - TODO confirm intent
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if not vcodec:
                # For vp9/av1/hvc1 only keep the first four dotted fields
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
                # Detect HDR variants from the codec string
                if codec in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                    hdr = 'HDR10'
                elif full_codec.replace('0', '').startswith('vp9.2'):
                    hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3451
3452
def urlhandle_detect_ext(url_handle):
    """Determine a file extension for a response, preferring the filename
    from Content-Disposition over the Content-Type."""
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3465
3466
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for `data` (bytes) with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3469
3470
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no viewer limit was set, or the content has no restriction
        return False
    return age_limit < content_limit
61ca9a80
PH
3479
3480
def is_html(first_bytes):
    """Detect whether a byte string looks like the start of an HTML document.

    Known byte-order marks are stripped first; the last BOM found determines
    the decoding used before checking for a leading '<'.
    """
    # Order matters: the UTF-32 marks must be tried before their UTF-16 prefixes
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
3498
3499
def determine_protocol(info_dict):
    """Return the download protocol for `info_dict`, inferring it from the
    URL prefix or extension when not set explicitly."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    # Streaming schemes are identified by prefix (covers e.g. rtmpe, rtmpt)
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3520
3521
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and the alignment tab
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only columns whose filter entry is truthy; columns beyond the
        # filter's length are kept (fillvalue=True)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # When hide_empty, drop columns that are empty in every data row
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a separator line of `delim` characters under the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align the part after the tab by padding before it
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
347de493
PH
3552
3553
8f18aca8 3554def _match_one(filter_part, dct, incomplete):
77b87f05 3555 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3556 STRING_OPERATORS = {
3557 '*=': operator.contains,
3558 '^=': lambda attr, value: attr.startswith(value),
3559 '$=': lambda attr, value: attr.endswith(value),
3560 '~=': lambda attr, value: re.search(value, attr),
3561 }
347de493 3562 COMPARISON_OPERATORS = {
a047eeb6 3563 **STRING_OPERATORS,
3564 '<=': operator.le, # "<=" must be defined above "<"
347de493 3565 '<': operator.lt,
347de493 3566 '>=': operator.ge,
a047eeb6 3567 '>': operator.gt,
347de493 3568 '=': operator.eq,
347de493 3569 }
a047eeb6 3570
6db9c4d5 3571 if isinstance(incomplete, bool):
3572 is_incomplete = lambda _: incomplete
3573 else:
3574 is_incomplete = lambda k: k in incomplete
3575
64fa820c 3576 operator_rex = re.compile(r'''(?x)
347de493 3577 (?P<key>[a-z_]+)
77b87f05 3578 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3579 (?:
a047eeb6 3580 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3581 (?P<strval>.+?)
347de493 3582 )
347de493 3583 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3584 m = operator_rex.fullmatch(filter_part.strip())
347de493 3585 if m:
18f96d12 3586 m = m.groupdict()
3587 unnegated_op = COMPARISON_OPERATORS[m['op']]
3588 if m['negation']:
77b87f05
MT
3589 op = lambda attr, value: not unnegated_op(attr, value)
3590 else:
3591 op = unnegated_op
18f96d12 3592 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3593 if m['quote']:
3594 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3595 actual_value = dct.get(m['key'])
3596 numeric_comparison = None
f9934b96 3597 if isinstance(actual_value, (int, float)):
e5a088dc
S
3598 # If the original field is a string and matching comparisonvalue is
3599 # a number we should respect the origin of the original field
3600 # and process comparison value as a string (see
18f96d12 3601 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3602 try:
18f96d12 3603 numeric_comparison = int(comparison_value)
347de493 3604 except ValueError:
18f96d12 3605 numeric_comparison = parse_filesize(comparison_value)
3606 if numeric_comparison is None:
3607 numeric_comparison = parse_filesize(f'{comparison_value}B')
3608 if numeric_comparison is None:
3609 numeric_comparison = parse_duration(comparison_value)
3610 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3611 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3612 if actual_value is None:
6db9c4d5 3613 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3614 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3615
3616 UNARY_OPERATORS = {
1cc47c66
S
3617 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3618 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3619 }
64fa820c 3620 operator_rex = re.compile(r'''(?x)
347de493 3621 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3622 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3623 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3624 if m:
3625 op = UNARY_OPERATORS[m.group('op')]
3626 actual_value = dct.get(m.group('key'))
6db9c4d5 3627 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3628 return True
347de493
PH
3629 return op(actual_value)
3630
3631 raise ValueError('Invalid filter part %r' % filter_part)
3632
3633
def match_str(filter_str, dct, incomplete=False):
    """ Check *dct* against an '&'-separated filter string.
    @returns Whether every filter part passes
    @param incomplete Set of keys that is expected to be missing from dct.
        Can be True/False to indicate all/none of the keys may be missing.
        All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
347de493
PH
3644
3645
def match_filter_func(filters):
    """Build a match-filter callable from one or more filter strings.

    Returns None when no filters are given. The returned callable takes
    (info_dict, incomplete) and returns None on a pass, NO_DEFAULT for the
    interactive marker '-', or a human-readable skip reason string.
    """
    if not filters:
        return None
    filter_set = set(variadic(filters))

    # The special filter '-' requests interactive confirmation
    interactive = '-' in filter_set
    if interactive:
        filter_set.discard('-')

    def _match_func(info_dict, incomplete=False):
        passed = not filter_set or any(match_str(f, info_dict, incomplete) for f in filter_set)
        if passed:
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'video'
        filter_str = ') | ('.join(map(str.strip, filter_set))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'

    return _match_func
91410c9b
PH
3663
3664
def download_range_func(chapters, ranges):
    """Build a callable yielding the sections to download: chapters whose
    title matches one of the *chapters* regexes (with their index), followed
    by explicit {'start_time', 'end_time'} dicts from *ranges*."""
    def inner(info_dict, ydl):
        available_chapters = info_dict.get('chapters') or []
        if available_chapters:
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'
        for regex in chapters or []:
            for idx, chapter in enumerate(available_chapters):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': idx}
        if chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in ranges or []:
            yield {'start_time': start, 'end_time': end}

    return inner
3680
3681
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None if the
    expression is empty or unrecognized."""
    if not time_expr:
        return None

    # Plain offset form, e.g. '12.345' or '12.345s'
    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock form HH:MM:SS(.fff), also tolerating ':' as the fraction separator
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, secs = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(secs.replace(':', '.'))
bf6427d2
YCH
3693
3694
def srt_subtitles_timecode(seconds):
    """Format a timestamp in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hrs, mins, secs, msecs = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
3697
3698
def ass_subtitles_timecode(seconds):
    """Format a timestamp in seconds as an ASS timecode (H:MM:SS.cc),
    i.e. with centisecond precision."""
    hrs, mins, secs, msecs = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (hrs, mins, secs, msecs / 10)
bf6427d2
YCH
3702
3703
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data into SRT format.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> paragraphs
    '''
    # Legacy TTAF namespaces are rewritten (as raw bytes, before parsing) to
    # their modern TTML equivalents so a single set of XPath expressions works
    # for both old and new documents.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML style properties that are translated into SRT markup below
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Qualify tag/attribute names with the TTML namespaces
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> resolved style-property dict
    default_style = {}  # style inherited from the <body>/<div> elements

    class TTMLPElementParser:
        # XMLParser target that renders one <p> element as SRT-flavoured text
        # using <b>, <i>, <u> and <font> markup.
        # NOTE(review): these are class-level attributes. _out is only ever
        # rebound per instance (via +=), but the two lists are mutated in
        # place and therefore shared between instances; start/end calls are
        # balanced during a parse, which appears to leave them empty between
        # uses — confirm before reusing this class outside parse_node().
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style = defaults, overridden by the referenced
                # style id, overridden by inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close this element's markup in reverse order of opening
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize one <p> node and feed it through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces with a byte-level replace before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> definitions; the loop re-runs whenever a style refers
    # to a parent style that has not been resolved yet (forward reference)
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced from <body> or <div> becomes the default style
    # for every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # Emit one numbered SRT cue per <p>; paragraphs without a parseable
    # begin time are dropped, and a missing end time falls back to begin+dur
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3866
3867
def cli_option(params, command_option, param, separator=None):
    """Build the CLI arguments for an option whose value comes from *params*.

    Returns [] when the value is missing, [command_option, value] when no
    separator is given, and a single joined 'option<sep>value' otherwise.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3873
3874
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the CLI arguments for a boolean option: the value in *params*
    must be True, False or None; True/False are mapped to the given string
    values and None yields no arguments."""
    value = params.get(param)
    assert value in (True, False, None)
    mapping = {True: true_value, False: false_value}
    return cli_option(mapping, command_option, value, separator)
66e289ba
S
3879
3880
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*,
    else an empty list (for flags that take no value)."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
3883
3884
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up the first matching argument list in *argdict* for the given
    *keys* (tried in order; each entry may be a single key or a group of
    keys) and return the flattened result, or *default* if nothing matches.

    A plain list/tuple argdict is accepted for backward compatibility and
    returned as-is when use_compat is set.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        arg_list = [args for args in candidates if args is not None]
        if arg_list:
            return list(itertools.chain.from_iterable(arg_list))
    return default
66e289ba 3903
6251555f 3904
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    # Build the ordered lookup-key list for cli_configuration_args, from most
    # to least specific: '<root_key><suffix>' for each requested suffix (where
    # root_key is '<exe>' or '<main_key>+<exe>'), then (main_key, exe) as a
    # grouped fallback, then 'default'.
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    # root_key appears in keys only when one of the requested suffixes is ''
    # (the bare key); only then are the generic fallbacks and the
    # backward-compat list form enabled.
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
3916
66e289ba 3917
class ISO639Utils:
    """Translate between two-letter (ISO 639-1) and three-letter
    (ISO 639-2/T) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (allows 'en-US' etc.)
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # First match wins, so deprecated aliases listed later do not shadow
        # the current two-letter code
        return next(
            (short for short, long_ in cls._lang_map.items() if long_ == code),
            None)
4121
4122
class ISO3166Utils:
    """Map two-letter ISO 3166-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        normalized = code.upper()
        return cls._country_map.get(normalized)
4384
4385
class GeoUtils:
    """Generate plausible IPv4 addresses for geo-restriction bypass."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted-quad string) inside the given
        CIDR block, or inside the mapped block for a two-letter country code.
        Returns None for an unknown country code."""
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Network base address as a 32-bit integer (big-endian)
        base = struct.unpack('!L', socket.inet_aton(addr))[0]
        # Host-part mask derived from the prefix length
        host_mask = 0xffffffff >> int(preflen)
        chosen = random.randint(base, base | host_mask)
        return str(socket.inet_ntoa(struct.pack('!L', chosen)))
773f291d
S
4644
4645
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that lets each request override the proxy via the internal
    'Ytdl-request-proxy' header ('__noproxy__' disables proxying for that request)."""

    def __init__(self, proxies=None):
        # Set default handlers
        # NOTE: default arguments bind type/meth per loop iteration, avoiding
        # the late-binding closure pitfall; these must be set *before* the
        # base-class __init__ so it does not install its own handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (if present) overrides the default one
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        # SOCKS proxies are not handled here; they are flagged via a header
        # and handled later when the connection socket is created
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
4669
4670
0a5445dd
YCH
4671# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4672# released into Public Domain
4673# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4674
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # Minimal big-endian length (n == 0 still needs one byte)
    length = max(1, (n.bit_length() + 7) // 8)
    # Round up to a multiple of blocksize, if requested
    if blocksize > 0 and length % blocksize:
        length += blocksize - length % blocksize
    return n.to_bytes(length, 'big')
4703
4704
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Big-endian interpretation; an empty string yields 0
    return int.from_bytes(s, 'big')
4720
4721
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The message is interpreted little-endian: reverse, then read as
    # big-endian hex
    message = int(binascii.hexlify(data[::-1]), 16)
    ciphertext = pow(message, exponent, modulus)
    return format(ciphertext, 'x')
81bdc8fd
YCH
4737
4738
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError if data cannot fit in length bytes with minimal padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding string PS must consist of NON-zero bytes (RFC 8017 §7.2.1):
    # a zero byte inside PS would be mistaken for the 0x00 separator when the
    # message is de-padded. The previous randint(0, 254) could emit 0.
    pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4752
4753
7b2c3f47 4754def _base_n_table(n, table):
4755 if not table and not n:
4756 raise ValueError('Either table or n must be specified')
612f2be5 4757 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4758
44f14eb4 4759 if n and n != len(table):
612f2be5 4760 raise ValueError(f'base {n} exceeds table length {len(table)}')
4761 return table
59f898b7 4762
5eb6bdce 4763
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4775
4776
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {char: value for value, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    result = 0
    for char in string:
        result = result * base + digit_values[char]
    return result
4784
4785
def decode_base(value, digits):
    # Deprecated alias of decode_base_n, kept for backward compatibility only
    write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
                 'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
    return decode_base_n(value, table=digits)
f52354a8
YCH
4790
4791
def decode_packed_codes(code):
    """Decode packed (obfuscated) code matched by PACKED_CODES_RE, replacing
    each base-n encoded word with its entry from the symbol list."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n encoded index to its replacement (or itself if empty)
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
e154c651 4808
4809
1ced2221
S
def caesar(s, alphabet, shift):
    """Caesar-shift every character of *s* that occurs in *alphabet* by
    *shift* positions (wrapping around); other characters pass through."""
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = ''.join(alphabet[(pos + shift) % size] for pos in range(size))
    return s.translate(str.maketrans(alphabet, shifted))
4817
4818
def rot47(s):
    # ROT47: Caesar-shift over the 94 printable ASCII characters '!'..'~' by 47,
    # which makes the transformation its own inverse
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4821
4822
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted,value",...') into a dict,
    stripping surrounding double quotes from quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
1143535d
YCH
4830
4831
def urshift(val, n):
    """Logical (unsigned) right shift of a 32-bit value by n bits."""
    if val < 0:
        # reinterpret a negative 32-bit two's-complement value as unsigned
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4834
4835
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into (width, height, pixels), where pixels is a list of
    rows and each row is a flat list of channel bytes (3 per pixel).

    Raises OSError on malformed input.
    NOTE(review): only width/height are read from IHDR; the code assumes
    8-bit samples and 3 bytes per pixel (truecolour, non-interlaced) — confirm
    against the PNGs this is used on.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]  # strip the fixed 8-byte PNG signature

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    # Read big-endian unsigned ints of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the remainder into (length, type, data, CRC) chunks; CRCs are not verified
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk (validated above)
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT chunks into the full zlib-compressed image stream
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # bytes per scanline (3 channel bytes per pixel)
    pixels = []

    def _get_pixel(idx):
        # Fetch an already-reconstructed channel byte by flat (row-major) index
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    # Undo the per-scanline filters (https://www.w3.org/TR/PNG/#9Filters)
    for y in range(height):
        basePos = y * (1 + stride)  # each scanline is prefixed by 1 filter-type byte
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # 'left' is the same channel of the previous pixel (3 bytes back);
            # 'up' is the same byte of the previous scanline
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1: # Sub
                color = (color + left) & 0xff
            elif filter_type == 2: # Up
                color = (color + up) & 0xff
            elif filter_type == 3: # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4: # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the predictor closest to p, preferring a, then b, then c
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
4941
4942
def write_xattr(path, key, value):
    """Write the extended attribute *key* (str) = *value* (bytes) on *path*.

    Tries, in order: NTFS ADS (Windows), the xattr/pyxattr Python modules,
    then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError on failure, XAttrUnavailableError when no
    mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools take the value as text, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4992
4993
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the string components of
    a random date between 1950-01-01 and 1995-12-31 (inclusive)."""
    epoch = datetime.date(1950, 1, 1)
    span = (datetime.date(1995, 12, 31) - epoch).days
    chosen = epoch + datetime.timedelta(days=random.randint(0, span))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
732044af 5004
c76eb41b 5005
# Templates for internet shortcut files, which are plain text files.
# All are %-interpolated with a mapping containing 'url' (and, for the
# .desktop template, also 'filename').
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS .webloc shortcut (XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org .desktop link entry
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Shortcut-file templates keyed by link type
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5037
732044af 5038
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    # Bug fix: only drop the port when it is the scheme's default. The old
    # code compared unconditionally against 80, which silently dropped an
    # explicit, NON-default port 80 on e.g. https/ftp URLs.
    default_port = {'http': 80, 'https': 443, 'ftp': 21}.get(iri_parts.scheme)
    if iri_parts.port is not None and iri_parts.port != default_port:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
         net_location,

         urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

         # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
         urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

         # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
         urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

         urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5081
5082
def to_high_limit_path(path):
    """On Windows, prefix the absolute path with the extended-length marker to
    work around the MAX_PATH limitation; elsewhere return the path unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 5089
c76eb41b 5090
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up *field* in *obj* (via traverse_obj) and interpolate func(value)
    into *template*; return *default* when the value is falsy (but not 0) or,
    if *ignore* is given, when the value is one of the ignored ones."""
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val and val != 0
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
00dd0cd5 5096
5097
def clean_podcast_url(url):
    """Strip known podcast analytics/tracking redirect prefixes from *url*."""
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
ffcb8191
THD
5113
5114
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random string in UUID-v4 layout (8-4-4-4-12 hex digits,
    with the version nibble fixed to '4')."""
    return re.sub(r'[xy]', lambda _: random.choice(_HEX_TABLE), 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5120
5121
def make_dir(path, to_screen=None):
    """Ensure the parent directory of *path* exists.

    @param to_screen  optional callable used to report a failure message
    @returns          True on success (or nothing to do), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # Bug fix: the original tested `callable(to_screen) is not None`,
        # which is always True (callable() returns a bool) and so crashed
        # here whenever to_screen was not actually callable
        if callable(to_screen):
            to_screen('unable to create directory ' + str(err))
        return False
f74980cb 5132
5133
def get_executable_path():
    """Directory containing the running executable/script, as determined by
    the updater's _get_variant_and_executable_path()."""
    # Imported lazily to avoid a circular import with .update
    from .update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5138
5139
def load_plugins(name, suffix, namespace):
    """Import ytdlp_plugins/<name>/__init__.py (if present) and copy every
    attribute ending in *suffix* into *namespace* (existing names are kept).
    Returns a dict of the attributes that were added."""
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(spec)
        sys.modules[spec.name] = plugins
        spec.loader.exec_module(plugins)
        # (renamed loop variable: the original shadowed the `name` parameter)
        for attr in dir(plugins):
            if attr in namespace or not attr.endswith(suffix):
                continue
            klass = getattr(plugins, attr)
            classes[attr] = namespace[attr] = klass
    return classes
06167fbb 5156
5157
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                            - None:     Do nothing
                            - string:   A dictionary key
                            - int:      An index into a list
                            - tuple:    A list of keys all of which will be traversed
                            - Ellipsis: Fetch all values in the object
                            - Function: Takes the key and value as arguments
                                        and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        # Lowercase all string keys up front; dict lookups then compare
        # against lowercased dict keys (see the dict branch below)
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (nonlocal) counts how many branching keys (..., tuple,
        # callable) were traversed, i.e. how deeply the result lists nest
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # None key is a no-op; None obj means nothing left to traverse
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Alternatives: traverse each sub-key, then flatten like `...`
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # Branch into every value of the current object
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Filter function: keep values for which key(k, v) is truthy
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User-supplied keys: numeric strings become ints,
                    # colon-separated forms become slices
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # ':' selects everything, like `...`
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching occurred: flatten all but one nesting level, then
                # drop Nones and apply the type filter to the collected values
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5254
5255
def traverse_dict(dictn, keys, casesense=True):
    # Deprecated thin wrapper around traverse_obj, kept for backward compatibility
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5260
5261
def get_first(obj, keys, **kwargs):
    """Apply *keys* to every item of *obj* and return the first non-None match
    (traverse_obj with a leading `...` and get_all=False)."""
    return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5264
5265
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return *x* unchanged if it is an iterable that should be treated as a
    sequence; otherwise (including for the 'atomic' allowed_types) wrap it
    into a 1-tuple."""
    is_sequence = isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types)
    return x if is_sequence else (x,)
bd50a52b
THD
5268
5269
3e9b66d7
LNO
def time_seconds(**kwargs):
    """Current UNIX timestamp. The kwargs (timedelta arguments) select the
    timezone of the intermediate aware datetime; the resulting epoch value
    itself is timezone-independent."""
    offset = datetime.timedelta(**kwargs)
    now = datetime.datetime.now(datetime.timezone(offset))
    return now.timestamp()
5273
5274
49fa4d9a
N
5275# create a JSON Web Signature (jws) with HS256 algorithm
5276# the resulting format is in JWS Compact Serialization
5277# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5278# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JSON Web Signature (JWS Compact Serialization) token signed
    with HS256, following RFC 7515/7519.

    @param payload_data  JSON-serializable payload (claims)
    @param key           signing key (str)
    @param headers       optional extra/overriding header fields
    @returns             the token as bytes: header.payload.signature
    NOTE(review): uses standard (not URL-safe, padded) base64, deviating from
    the JWS spec — kept as-is since consumers of this helper expect it.
    """
    # Idiom fix: the default was a mutable `headers={}` (never mutated here,
    # but a latent pitfall); None is the conventional sentinel
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
819e0531 5292
5293
16b0d7e6 5294# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded payload of a JWT; the signature is NOT verified."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5299
5300
# Tri-state: None on non-Windows platforms; on Windows it starts False and is
# flipped to True by windows_enable_vt_mode() once VT processing is enabled
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5302
5303
@functools.cache
def supports_terminal_sequences(stream):
    """Whether *stream* can be expected to handle ANSI terminal sequences
    (result is cached per stream)."""
    # On Windows, VT processing must have been enabled first;
    # elsewhere, a TERM environment variable must be present
    if compat_os_name == 'nt' and not WINDOWS_VT_MODE:
        return False
    if compat_os_name != 'nt' and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5315
5316
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Best-effort attempt to enable VT (ANSI) sequence processing in the
    Windows console, updating the WINDOWS_VT_MODE global on success."""
    # VT sequences require Windows 10 TH2 (build 10586) or later
    if get_windows_version() < (10, 0, 10586):
        return
    global WINDOWS_VT_MODE
    try:
        # NOTE(review): spawning a (no-op) shell appears to be relied upon to
        # switch the console into VT mode as a side effect — confirm
        Popen.run('', shell=True)
    except Exception:
        return

    WINDOWS_VT_MODE = True
    # Invalidate cached results now that the VT support state has changed
    supports_terminal_sequences.cache_clear()
5328
5329
# Matches ESC [ ... m, i.e. ANSI SGR (color/style) sequences
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI SGR escape sequences from *string*."""
    return _terminal_sequences_re.sub('', string)
5335
5336
def number_of_digits(number):
    """Length of '%d' % number, i.e. decimal digit count plus any minus sign."""
    return len('%d' % number)
34921b43 5339
5340
def join_nonempty(*values, delim='-', from_dict=None):
    """Stringify and join all truthy values with *delim*. If *from_dict* is
    given, each value is first used as a traversal path into it."""
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(v) for v in values if v)
06e57990 5345
5346
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format declares a width - nothing to scale against
        return thumbnails
    largest_width = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        rewritten = {'url': re.sub(url_width_re, largest_width, thumbnail['url'])}
        scaled.append(merge_dicts(rewritten, dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5367
5368
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not mobj:
        return None, None, None
    start, end, total = mobj.groups()
    # end/total may be absent; int_or_none maps those to None
    return int(start), int_or_none(end), int_or_none(total)
5377
5378
def read_stdin(what):
    """Notify the user (via write_string) that *what* will be read from STDIN,
    mentioning the platform's EOF key combination, and return sys.stdin."""
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5383
5384
class Config:
    """A parsed set of command-line arguments together with any config files
    they pull in via --config-locations (loaded recursively, with protection
    against loading the same file twice)."""
    own_args = None       # the raw argument list this config was initialized with
    parsed_args = None    # set to own_args once load_configs() has run
    filename = None       # path of the config file these args came from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Initialize (exactly once) with the given args/filename and load nested configs."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and append every --config-locations target as a child
        config. Returns False if this file was already loaded (duplicate/cycle)."""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config file's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split it into an argument list (shell-like
        quoting; '#' starts a comment). Returns *default* if the file cannot
        be opened."""
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Bug fix: the filename was previously not interpolated into the message
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with values of credential options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the '--option=value' form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the '--option value' form: blank out the following argument
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Load another config (arg list and/or file) as a child of this one,
        sharing the loaded-paths set for cycle detection."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All arguments: child configs' args first (in reverse append order),
        then this config's own args."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
5484
5485
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    # The active connection object, set by __enter__
    pool = None

    def __init__(self, url, headers=None, connect=True):
        # A private event loop is created so async websockets calls can be
        # driven synchronously via run_with_loop()
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection/loop is torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run a coroutine to completion on the given loop, then drain async
        # generators/executors (mirrors asyncio.run()'s shutdown sequence)
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel all still-pending tasks on the loop and surface any
        # exceptions they raised (other than cancellation)
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5555
5556
def merge_headers(*dicts):
    """Merge HTTP-header dicts case-insensitively; on collision the
    right-most dict wins. Keys are normalized with str.title()."""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            merged[name.title()] = value
    return merged
28787f16 5560
5561
def cached_method(f):
    """Cache a method's results on the instance, keyed by its bound
    arguments with defaults applied (so f(1) and f(1, default) share an entry)."""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        key = tuple(bound.arguments.values())

        try:
            all_caches = self.__cached_method__cache
        except AttributeError:
            all_caches = self.__cached_method__cache = {}
        cache = all_caches.setdefault(f.__name__, {})
        try:
            return cache[key]
        except KeyError:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
5579
5580
class classproperty:
    """Descriptor exposing a function of the class as a read-only,
    property-like class attribute (works on both the class and instances)."""

    def __init__(self, func):
        # Copy func's metadata (__doc__, __name__, ...) onto the descriptor
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, owner):
        # Always dispatch on the owning class, never the instance
        return self.func(owner)
19a03940 5590
5591
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterating the namespace yields its attribute *values*
        yield from vars(self).values()

    @property
    def items_(self):
        # (name, value) view; trailing underscore avoids clashing with
        # an attribute literally named "items"
        return vars(self).items()
9b8ee23b 5601
5602
# Deprecated: legacy feature-detection flags kept for backward compatibility.
# Truthy when the optional module was importable (the names come from
# .dependencies — presumably None when the import failed; verify there).
has_certifi = bool(certifi)
has_websockets = bool(websockets)