]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[docs] Consistent use of `e.g.` (#4643)
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
62e609ab 8import contextlib
c496ca96 9import datetime
0c265486 10import email.header
f8271158 11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
49fa4d9a
N
14import hashlib
15import hmac
ac668111 16import html.entities
17import html.parser
54007a45 18import http.client
19import http.cookiejar
019a94f7 20import importlib.util
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
989a01c2 43import unicodedata
14f25df2 44import urllib.error
f8271158 45import urllib.parse
ac668111 46import urllib.request
bcf89ce6 47import xml.etree.ElementTree
d77c3dfd 48import zlib
d77c3dfd 49
6929b41a 50from .compat import functools # isort: split
8c25f81b 51from .compat import (
36e6f62c 52 compat_etree_fromstring,
51098426 53 compat_expanduser,
f8271158 54 compat_HTMLParseError,
efa97bdc 55 compat_os_name,
702ccf2d 56 compat_shlex_quote,
8c25f81b 57)
ac668111 58from .dependencies import brotli, certifi, websockets, xattr
f8271158 59from .socks import ProxyType, sockssocket
71aff188 60
4644ac55 61
51fb4995
YCH
def register_socks_protocols():
    """Add the SOCKS proxy schemes to urllib.parse.uses_netloc (idempotent)."""
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    missing = [
        scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
        if scheme not in urllib.parse.uses_netloc]
    urllib.parse.uses_netloc.extend(missing)
51fb4995
YCH
69
70
468e2e92
FV
# Type object of a compiled regular expression, obtained by compiling an
# empty pattern; usable in isinstance() checks.
compiled_regex_type = type(re.compile(r''))
73
f7a147e3
S
74
def random_user_agent():
    """Return a desktop Chrome User-Agent string with a randomly picked version."""
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    picked = random.choice(chrome_versions)
    return template % picked
118
119
# Content encodings advertised as supported; 'br' is only added when the
# optional brotli dependency is available.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default request headers; the User-Agent is picked once, at import time.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel for "argument not given" where None is a meaningful value
NO_DEFAULT = object()
# Function returning its argument unchanged
IDENTITY = lambda x: x
bf42a990 141
7105440c
YCH
# Month names in calendar order, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Month names keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre',
    ],
}

# Timezone abbreviation -> offset from UTC in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7,  # Pacific
}

# needed for sanitizing filenames in restricted mode
# Each accented character is paired positionally with its ASCII replacement;
# the bracketed items are the multi-letter replacements.
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 167
46f59e89
S
# strptime() formats that are independent of day/month ordering
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Common formats followed by the numeric formats that put the day first
DATE_FORMATS_DAY_FIRST = [
    *DATE_FORMATS,
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
]

# Common formats followed by the numeric formats that put the month first
DATE_FORMATS_MONTH_FIRST = [
    *DATE_FORMATS,
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
]

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

# Unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
237
7105440c 238
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() if that encoding can actually be
    used to encode text, and 'UTF-8' otherwise. The result is cached.
    """
    fallback = 'UTF-8'
    try:
        candidate = locale.getpreferredencoding()
        # Probe that the reported codec really exists and works
        'TEST'.encode(candidate)
    except Exception:
        return fallback
    return candidate
d77c3dfd 253
f4bfd65f 254
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary file in the same directory as
    fn, which is then moved over fn. If anything fails, the temporary file is
    removed and the exception is re-raised.
    """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        with contextlib.suppress(OSError):
            # The temporary file is created with restrictive permissions;
            # give it the permissions a regular new file would get.
            # os.umask can only be read by setting it, hence the double call.
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        # os.replace overwrites an existing destination on every platform
        # (including Windows), so the old file never has to be deleted first
        # and is not lost if the move fails.
        os.replace(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
279
280
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val]

    With val=None, the first element that merely has the attribute is returned.
    """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 286
d7e66d39
JMF
287# On python2.6 the xml.etree.ElementTree.Element methods don't support
288# the namespace parameter
5f6a1245
JW
289
290
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{namespace}tag' using ns_map."""
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, local_name = pieces
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join(map(expand, path.split('/')))
301
d77c3dfd 302
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element of node matched by xpath.

    @param xpath    A single xpath string, or an iterable of xpaths that are
                    tried in order until one matches
    @param name     Name used in the error message instead of the xpath
    @param fatal    Raise ExtractorError if nothing matches and no default is given
    @param default  Value returned when nothing matches; takes precedence over fatal
    """
    candidates = [xpath] if isinstance(xpath, str) else xpath

    # Start from "not found" so that an empty iterable of xpaths is handled
    # like any other miss instead of leaving the result unbound
    n = None
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
324
325
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element, or an element without text, yields default if one was
    given, raises ExtractorError if fatal is set, and returns None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
339
340
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    If there is no such element: return default if given, raise
    ExtractorError if fatal is set, else return None.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
352
353
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag whose "id" attribute equals the given ID, or None"""
    return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 357
12ea2f30 358
def get_element_html_by_id(id, html, **kwargs):
    """Return the whole html of the tag whose "id" attribute equals the given ID, or None"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
362
363
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class, or None if there is none"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
368
369
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the whole html of the first tag with the specified class, or None if there is none"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
374
375
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag whose attribute has the given value, or None"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
379
380
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the whole html of the first tag whose attribute has the given value, or None"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
384
385
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as one whole word anywhere inside the class attribute;
    # the value is already a regex, so it must not be escaped again
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
391
392
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as one whole word anywhere inside the class attribute;
    # the value is already a regex, so it must not be escaped again
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
398
399
def get_elements_by_attribute(*args, **kwargs):
    """Return a list with the content of every tag with the specified attribute in the passed HTML document"""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [text for text, _whole in pairs]
403
404
def get_elements_html_by_attribute(*args, **kwargs):
    """Return a list with the whole html of every tag with the specified attribute in the passed HTML document"""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [whole for _text, whole in pairs]
408
409
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Yield a (content, whole) pair - the unescaped text and the complete
    html - for each tag with the specified attribute in the passed HTML document

    @param escape_value  Treat value as a literal string (default) rather than a regex
    """

    # Quotes around the attribute value are optional, unless the raw value
    # starts with a character that cannot begin an unquoted value
    optional_quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value_re = re.escape(value) if escape_value else value

    opening_tag_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
        (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
        \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{optional_quote})(?-x:{value_re})(?P=_q)
    '''

    for match in re.finditer(opening_tag_re, html):
        tag_name = match.group('tag')
        content, whole = get_element_text_and_html_by_tag(tag_name, html[match.start():])

        # Drop a single pair of quotes wrapping the entire content
        unquoted = re.sub(
            r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)
        yield unescapeHTML(unquoted), whole
a921f407 433
c5229f39 434
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised when the closing tag of the outermost open element is reached"""

    def __init__(self):
        # Opening tags that are currently unclosed, outermost first
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Unwind the stack up to and including the matching opening tag
        matched = False
        while self.tagstack and not matched:
            matched = self.tagstack.pop() == tag
        if not matched:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
475
476
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the element cannot be delimited.
    """
    def locate(text, token, error):
        # Like str.index, but raising the supplied exception on a miss
        try:
            return text.index(token)
        except ValueError:
            raise error

    end_marker = f'</{tag}>'
    element_start = locate(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    body_start = locate(
        html[element_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    body_start += element_start + 1

    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[element_start:body_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        # Feed the document one closing-tag candidate at a time; the parser
        # signals (by exception) the one that closes the outermost element
        cursor = body_start
        while cursor < len(html):
            close_start = locate(
                html[cursor:], end_marker,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            close_end = close_start + len(end_marker)
            try:
                parser.feed(html[cursor:cursor + close_end])
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                content = html[body_start:cursor + close_start]
                whole = html[element_start:cursor + close_end]
                return content, whole
            cursor += close_end
        raise compat_HTMLParseError('unexpected end of html')
510
511
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen opening tag
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = {name: value for name, value in attrs}
521
c5229f39 522
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        # One attribute dict per <li> opened at nesting level 0
        self.items = []
        # Opening tags seen minus closing tags seen
        self._level = 0

    def handle_starttag(self, tag, attrs):
        at_top_level = self._level == 0
        if at_top_level and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
538
539
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Best effort: keep whatever was gathered before the parse error
        pass
    return parser.attrs
9e6dd238 559
c5229f39 560
73673ccf
FF
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per element"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
568
569
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Applied in order: collapse whitespace, turn <br> and </p><p> into
    # newlines, then strip all remaining html tags
    substitutions = (
        (r'\s+', ' '),
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
584
585
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that can pre-process its input and tolerate trailing data.

    @param transform_source  Callable applied to the source string before decoding
    @param ignore_extra      Decode only the first JSON value, ignoring what follows it
    """

    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source = transform_source
        self.ignore_extra = ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if not self.ignore_extra:
            return super().decode(s)
        # raw_decode does not skip leading whitespace by itself
        value, _end = self.raw_decode(s.lstrip())
        return value
597
598
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries once more
    with the name passed through sanitize_path(); if that changes nothing or
    fails too, the exception is raised, like the standard open() function.
    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return getattr(sys.stdout, 'buffer', sys.stdout), filename

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to a plain, unlocked open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            is_last_attempt = bool(attempt)
            if is_last_attempt or err.errno == errno.EACCES:
                raise
            sanitized = sanitize_path(filename)
            if sanitized == filename:
                # Sanitizing changes nothing, so a retry cannot succeed
                raise
            filename = sanitized
d77c3dfd
FV
636
637
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 645
5f6a1245 646
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    new_rules = is_id is NO_DEFAULT

    def replace_insane(char):
        # Substitutes are prefixed with '\0' so that they can be told apart
        # from characters of the input until the final cleanup
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if not restricted and char == '\n':
            return '\0 '
        if new_rules and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        if char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '\0_'
        return char

    if restricted and new_rules:
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(replace_insane(char) for char in s)
    if new_rules:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        strip_re = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{strip_re}|{strip_re}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if is_id:
        # Covers the unset case too: the NO_DEFAULT sentinel is truthy
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
d77c3dfd 699
5f6a1245 700
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged unless force is set.
    """
    on_windows = sys.platform == 'win32'
    if not on_windows and not force:
        return s

    if on_windows:
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    else:
        drive_or_unc = ''

    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)

    def clean(part):
        # '.' and '..' are kept; otherwise replace the reserved characters
        # and a trailing whitespace or dot with '#'
        if part in ['.', '..']:
            return part
        return re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', part)

    sanitized_path = [clean(part) for part in parts]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
722
723
def sanitize_url(url, *, scheme='http'):
    """Add a scheme to protocol-relative URLs and fix known scheme typos.

    Returns None for None; any other URL is returned unchanged.
    """
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return None
    if url.startswith('//'):
        return f'{scheme}:{url}'

    # Fix some common typos seen so far
    common_typos = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in common_typos:
        fixed, replaced = re.subn(mistake, fixup, url)
        if replaced:
            return fixed
    return url
17bcc626
S
742
743
def extract_basic_auth(url):
    """Split "user:password@" credentials off a URL.

    Returns (url, None) unchanged if the URL contains no username, else
    (url without the credentials, value for a Basic "Authorization" header).
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None

    host = parts.hostname
    if host and ':' in host:
        # .hostname strips the brackets of IPv6 literals; they must be put
        # back, else the host cannot be told apart from the port
        host = f'[{host}]'
    netloc = host if parts.port is None else '%s:%d' % (host, parts.port)
    url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))

    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
754
755
def sanitized_Request(url, *args, **kwargs):
    """Create a urllib.request.Request for the sanitized and escaped url.

    Credentials embedded in the url are removed from it and sent as an
    "Authorization" header instead.
    """
    cleaned_url = escape_url(sanitize_url(url))
    url, auth_header = extract_basic_auth(cleaned_url)
    if auth_header is not None:
        # headers is the second positional argument after url
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
67dda517
S
762
763
51098426
S
def expand_path(s):
    """Expand shell variables and ~"""
    home_expanded = compat_expanduser(s)
    return os.path.expandvars(home_expanded)
767
768
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, keeping first occurrences in order

    Returns a list, or a generator if lazy is set.
    """
    def _unique():
        emitted = []  # Do not use set since the items can be unhashable
        for item in iterable:
            if item in emitted:
                continue
            emitted.append(item)
            yield item

    unique_items = _unique()
    if lazy:
        return unique_items
    return list(unique_items)
d77c3dfd 779
912b38b4 780
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    @param entity_with_semicolon  The entity without the leading "&" but
                                  including the trailing ";", e.g. "amp;"
    Entities that cannot be decoded are returned in their literal form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        # chr() raises ValueError for an invalid code point, but
        # OverflowError if the number does not even fit into a C integer
        with contextlib.suppress(ValueError, OverflowError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
808
809
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for (None is passed through)"""
    if s is None:
        return None
    assert isinstance(s, str)

    def decode_entity(match):
        # group(1) is the entity without "&" but including ";"
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', decode_entity, s)
d77c3dfd 817
8bf48f23 818
def escapeHTML(text):
    """Escape the characters &, <, >, " and ' in text as HTML entities"""
    # '&' has to go first, so that the entities inserted by the later
    # replacements are not escaped again
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
828
829
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated alias of Popen.communicate_or_kill; writes a deprecation notice on each call"""
    notice = (
        'DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    write_string(notice)
    return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 834
835
class Popen(subprocess.Popen):
    """subprocess.Popen with text-mode defaults and kill-on-error helpers"""

    # startupinfo passed to every process; on Windows it has the
    # STARTF_USESHOWWINDOW flag set
    _startupinfo = None
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            # Explicitly passed encoding/errors win over these defaults
            kwargs = {
                'encoding': 'utf-8',
                'errors': 'replace',
                **kwargs,
                'universal_newlines': True,  # For 3.6 compatibility
            }
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless timeout is 0, also wait() for it with that timeout"""
        super().kill()
        if timeout == 0:
            return
        self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        """Run a process to completion and return (stdout, stderr, returncode).

        Output that was not captured is reported as ''.
        """
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
        return stdout or '', stderr or '', proc.returncode
867
d3c93ec2 868
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for subprocess calls ('utf-8' if none can be determined)"""
    use_locale_encoding = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if use_locale_encoding:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
879
880
def encodeFilename(s, for_subprocess=False):
    """Return s unchanged after asserting that it is a str; for_subprocess is ignored"""
    assert isinstance(s, str)
    return s
aa49acd1
S
884
885
def decodeFilename(b, for_subprocess=False):
    """Return b unchanged; for_subprocess is ignored"""
    return b
8bf48f23 888
f07b74fc
PH
889
def encodeArgument(s):
    """Return s as a str; anything that is not a str is decoded as ASCII"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
f07b74fc
PH
895
896
def decodeArgument(b):
    """Return b unchanged"""
    return b
aa49acd1
S
899
900
8271226a
PH
def decodeOption(optval):
    """Return optval as a str, decoding bytes with the preferred encoding (None is passed through)"""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval
1c256f70 909
5f6a1245 910
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into a (hours, minutes, seconds, milliseconds) namedtuple"""
    total_secs, millis = divmod(msec, 1000)
    total_mins, seconds = divmod(total_secs, 60)
    hours, minutes = divmod(total_mins, 60)
    return _timetuple(hours=hours, minutes=minutes, seconds=seconds, milliseconds=millis)
919
920
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [[H:]MM:]SS, omitting leading zero units

    @param delim  Separator between the units
    @param msec   Append the milliseconds as ".mmm"
    """
    hours, minutes, seconds, millis = timetuple_from_msec(secs * 1000)
    if hours:
        formatted = '%d%s%02d%s%02d' % (hours, delim, minutes, delim, seconds)
    elif minutes:
        formatted = '%d%s%02d' % (minutes, delim, seconds)
    else:
        formatted = '%d' % seconds
    if msec:
        return '%s.%03d' % (formatted, millis)
    return formatted
4539dd30 930
a0ddb8a2 931
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load the server-auth certificates of a Windows certificate store into ssl_context.

    Certificates that fail to load with ssl.SSLError are skipped, and a
    PermissionError while enumerating the store makes this a no-op.
    """
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        trusted = [
            der for der, enc, trust in ssl.enum_certificates(storename)
            if enc == 'x509_asn' and (trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return

    for der in trusted:
        try:
            ssl_context.load_verify_locations(cadata=der)
        except ssl.SSLError:
            pass
a2366922 943
77562778 944
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSL context is configured from params.

    Keys read from params: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'. Extra keyword arguments are forwarded
    to YoutubeDLHTTPSHandler.

    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    verify = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = verify
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    if not verify:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            except ssl.SSLError:
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile,
                keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    try:
        context.set_alpn_protocols(['http/1.1'])
    except NotImplementedError:
        pass

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 987
732ea2f0 988
def bug_reports_message(before=';'):
    """Return the "please report this issue" text, optionally prefixed.

    @param before  Text placed in front of the message (trailing whitespace is
                   stripped). The message starts with a capital letter when
                   this is empty or ends with '.', '!' or '?'.
    """
    from .update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    lead = before.rstrip()
    if not lead:
        return msg[0].title() + msg[1:]
    if lead.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]
    return lead + ' ' + msg
08f2a92c
JMF
1000
1001
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it with a fixed text
    msg = None

    def __init__(self, msg=None):
        """Use `msg` if given, else the class-level `msg`, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
1012
1013
# Exception types treated as network errors; ssl.CertificateError is included only when available
network_exceptions = (
    urllib.error.URLError,
    http.client.HTTPException,
    socket.error,
    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
)
1018
1019
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        The final message is "[ie] video_id: msg (caused by cause)", with the
        bug report text appended when the error is not expected.
        """
        # An error raised while a network exception is being handled is always expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Re-raised ExtractorError: keep the exc_info of the innermost one
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            # Use the stringified message: str.join() raises TypeError on non-str items
            self.orig_msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback and/or cause joined by a newline, or None if there is neither."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1052
1c256f70 1053
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1059
1060
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error raised when a regular expression did not match."""
1064
1065
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    May be raised when a video is unavailable from the user's location
    because of geographic restrictions imposed by a website.
    The error is always marked as expected.
    """

    def __init__(self, msg, countries=None, **kwargs):
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1077
1078
class UserNotLive(ExtractorError):
    """Expected extraction error for a channel/user that is not live."""

    def __init__(self, msg=None, **kwargs):
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
1085
1086
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may raise this when they are not configured to
    continue on errors. The exception carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
d77c3dfd
FV
1099
1100
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Raised by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
498f5606 1108
1109
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ': <filename>' to the default message when a filename is given."""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
d77c3dfd
FV
1122
1123
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this to signal an error in
    the postprocessing task.
    """
5f6a1245 1130
5f6a1245 1131
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
8b0d7497 1135
8b0d7497 1136
class ExistingVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-on-existing."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1140
48f79687 1141
class RejectedVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-on-reject."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1145
1146
class MaxDownloadsReached(DownloadCancelled):
    """Download cancellation raised once the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1150
1151
class ReExtractInfo(YoutubeDLError):
    """Raised when the video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1158
1159
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed is below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always reported with the fixed message and expected=False
        super().__init__(self.msg, expected=False)
f2ebc5c7 1166
d77c3dfd 1167
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ': <err>' to the default message when err is given."""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
d77c3dfd
FV
1180
1181
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this when a downloaded file is smaller
    than what the server first announced, which indicates the connection
    was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 1195
5f6a1245 1196
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an error code and message, classified into a `reason`.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in self.msg
            or 'Disk quota exceeded' in self.msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1211
1212
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass with no behaviour of its own."""
1215
1216
c5a59d93 1217def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1218 hc = http_class(*args, **kwargs)
be4a824d 1219 source_address = ydl_handler._params.get('source_address')
8959018a 1220
be4a824d 1221 if source_address is not None:
8959018a
AU
1222 # This is to workaround _create_connection() from socket where it will try all
1223 # address data from getaddrinfo() including IPv6. This filters the result from
1224 # getaddrinfo() based on the source_address value.
1225 # This is based on the cpython socket.create_connection() function.
1226 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1227 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1228 host, port = address
1229 err = None
1230 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
9e21e6d9
S
1231 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1232 ip_addrs = [addr for addr in addrs if addr[0] == af]
1233 if addrs and not ip_addrs:
1234 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1235 raise OSError(
9e21e6d9
S
1236 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1237 % (ip_version, source_address[0]))
8959018a
AU
1238 for res in ip_addrs:
1239 af, socktype, proto, canonname, sa = res
1240 sock = None
1241 try:
1242 sock = socket.socket(af, socktype, proto)
1243 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1244 sock.settimeout(timeout)
1245 sock.bind(source_address)
1246 sock.connect(sa)
1247 err = None # Explicitly break reference cycle
1248 return sock
86e5f3ed 1249 except OSError as _:
8959018a
AU
1250 err = _
1251 if sock is not None:
1252 sock.close()
1253 if err is not None:
1254 raise err
1255 else:
86e5f3ed 1256 raise OSError('getaddrinfo returns an empty list')
9e21e6d9
S
1257 if hasattr(hc, '_create_connection'):
1258 hc._create_connection = _create_connection
cfb0511d 1259 hc.source_address = (source_address, 0)
be4a824d
PH
1260
1261 return hc
1262
1263
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, `headers` is returned as is. With it, a new dict is
    returned that lacks both the marker and any Accept-Encoding header
    (matched case-insensitively); the input is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding' and name != marker}
87f0e62d
YCH
1272
1273
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request and transparently decodes gzip, deflate
    and brotli responses. To avoid compression for a particular request,
    the calling code only has to include the HTTP header
    "Youtubedl-no-compression", which is removed before the real request
    is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open the request, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set on it."""
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; falsy input is returned unchanged."""
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        """Escape the request URL and add the default headers before sending."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if original_url != escaped_url:
            req = update_Request(req, url=escaped_url)

        for name, value in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() not in req.headers:
                req.add_header(name, value)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode a compressed response body and percent-encode redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; give up with the first error
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            decoded = io.BytesIO(self.brotli(resp.read()))
            resp = urllib.request.addinfourl(decoded, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1402
5de90176 1403
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through the given SOCKS proxy.

    @param base_class   http.client.HTTPConnection, HTTPSConnection or a subclass
    @param socks_proxy  Proxy URL with scheme socks, socks4, socks4a or socks5;
                        the port defaults to 1080 and any username/password are
                        URL-unquoted

    Raises ValueError if the proxy URL has any other scheme.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Without this branch socks_type would be unbound below (UnboundLocalError)
        raise ValueError(f'Unsupported SOCKS proxy scheme: {url_components.scheme!r}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1445
1446
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler with a configurable connection class and SOCKS proxy support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open the request, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set on it.

        An SSLV3_ALERT_HANDSHAKE_FAILURE is re-raised as a YoutubeDLError
        that suggests --legacy-server-connect.
        """
        conn_class = self._https_conn_class

        # Forward the context/hostname settings only if the base handler stored them
        kwargs = {
            option: getattr(self, attr)
            for option, attr in (('context', '_context'), ('check_hostname', '_check_hostname'))
            if hasattr(self, attr)}

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **kwargs)
        except urllib.error.URLError as e:
            handshake_failed = (
                isinstance(e.reason, ssl.SSLError)
                and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if handshake_failed:
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
be4a824d
PH
1475
1476
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    Cookie jar that reads and writes the Netscape cookie file format.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        # Paths are normalised with os.fspath(); anything else is stored as given
        self.filename = os.fspath(filename) if self.is_path(filename) else filename

    @staticmethod
    def _true_or_false(cndn):
        if cndn:
            return 'TRUE'
        return 'FALSE'

    @staticmethod
    def is_path(file):
        """Whether `file` is a str, bytes or os.PathLike object."""
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Yield a file object for `file`.

        A path is opened as UTF-8 text and closed on exit. Any other object
        is yielded as is, after truncating it to length 0 when writing.
        """
        if not self.is_path(file):
            if write:
                file.truncate(0)
            yield file
            return

        with open(file, 'w' if write else 'r', encoding='utf-8') as f:
            yield f

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue

            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name

            fields = (
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value,
            )
            f.write('%s\n' % '\t'.join(fields))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is None:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
            filename = self.filename

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is None:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
            filename = self.filename

        def prepare_line(line):
            # Strip the HttpOnly marker so the entry is parsed like any other
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    # A line starting with [, { or " suggests a JSON export
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See  '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1609
1610
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that applies the HTTP request/response hooks to HTTPS as well."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1620
1621
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        safe_redirect = code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
        post_redirect = code in (301, 302, 303) and method == "POST"
        if not (safe_redirect or post_redirect):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # The redirected request carries no body, so drop the headers describing one
        content_headers = ("content-length", "content-type")
        newheaders = {
            name: value for name, value in req.headers.items()
            if name.lower() not in content_headers}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            method = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method)
fca6dba8
S
1682
1683
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone off a date string.

    Returns a (timezone, date_str) tuple: the UTC offset as a
    datetime.timedelta (zero if none was recognised) and the date string
    with the recognised timezone suffix removed.
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if offset_match:
        stripped = date_str[:-len(offset_match.group('tz'))]
        if not offset_match.group('sign'):
            # A bare "Z" means UTC
            return datetime.timedelta(), stripped
        sign = 1 if offset_match.group('sign') == '+' else -1
        offset = datetime.timedelta(
            hours=sign * int(offset_match.group('hours')),
            minutes=sign * int(offset_match.group('minutes')))
        return offset, stripped

    # No numeric offset: look for a timezone abbreviation following a time
    name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    hours = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
    if hours is not None:
        date_str = date_str[:-len(name_match.group('tz'))]
    return datetime.timedelta(hours=hours or 0), date_str
1712
1713
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   datetime.timedelta UTC offset; when None it is
                      extracted from date_str
    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded before parsing
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        local_dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((local_dt - timezone).timetuple())
    except ValueError:
        return None
912b38b4
PH
1729
1730
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime format list matching the requested day/month order"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1733
1734
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or no known format matches.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    result = None
    # Every format is tried; a later matching format overrides an earlier one
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Last resort: RFC 2822 style parsing
        parts = email.utils.parsedate_tz(cleaned)
        if parts:
            try:
                result = datetime.datetime(*parts[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return str(result)
bf50b038 1757
5f6a1245 1758
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string

    @param day_first  Prefer day-first formats over month-first ones
    Returns None if date_str is not a string or cannot be parsed.
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names ("sun" included, so that Sunday is
    # treated like every other weekday), then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    # NOTE(review): the 12h shift is applied to every "PM" time, including
    # 12:xx PM - confirm whether that is intended
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822 style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1790
1791
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    Returns default_ext when url is None, contains no dot, or the text after
    the last dot (query string excluded) does not look like an extension.
    """
    if url is None or '.' not in url:
        return default_ext

    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1803
5f6a1245 1804
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename by replacing the extension with "<lang>.<format>" """
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
d4051a8e 1807
5f6a1245 1808
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    offset = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if offset is None:
        # Plain DATE without any relative offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset is resolved recursively
    base = datetime_from_str(offset.group('start'), precision, format)
    amount = int(offset.group('time'))
    if offset.group('sign') == '-':
        amount = -amount

    unit = offset.group('unit')
    if unit in ('month', 'year'):
        # Calendar units are applied month-wise; results are rounded as days
        months = amount * 12 if unit == 'year' else amount
        shifted = datetime_add_months(base, months)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    if auto_precision:
        return datetime_round(shifted, unit)
    return shifted
1849
1850
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    Raises ValueError in strict mode when date_str matches neither pattern.
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1861
1862
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month index so divmod yields the year carry
    year_carry, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_carry
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(year=target_year, month=target_month, day=min(dt.day, last_day))
1870
1871
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision  microsecond|second|minute|hour|day
                      'microsecond' returns dt unchanged
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    # Round half up to the nearest multiple of step
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1888
1889
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1898
5f6a1245 1899
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError if start is later than end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date object is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1933
c496ca96
PH
1934
def platform_name():
    """ Returns the platform name as a str (deprecated; emits a warning on every call) """
    warning = 'DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead'
    write_string(warning)
    return platform.platform()
c496ca96 1939
b1f94422 1940
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime and platform (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    version = platform.python_version()
    bits = platform.architecture()[0]
    system = platform.platform()
    # libc name and version, wrapped in parentheses
    libc = format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)')
    return 'Python %s (%s %s) - %s %s' % (version, implementation, bits, system, libc)
c257baff
PH
1954
1955
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1963
1964
def write_string(s, out=None, encoding=None):
    """Write the str s to out (default: sys.stderr) and flush it

    @param encoding  Overrides the encoding used when out takes bytes
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Insert a space before each run of line breaks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, codec = out, None
    if 'b' in getattr(out, 'mode', ''):
        # Stream opened in binary mode: encode and write to it directly
        codec = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream exposing a byte buffer: write encoded bytes to the buffer
        target = out.buffer
        codec = encoding or getattr(out, 'encoding', None) or preferredencoding()

    payload = s.encode(codec, 'ignore') if codec else s
    target.write(payload)
    out.flush()
1981
1982
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes-like or str sequence into a list of integer code points"""
    if not bs:
        return []
    # Indexing bytes yields ints already; str items go through ord()
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
1990
c257baff 1991
def intlist_to_bytes(xs):
    """Pack a sequence of integers (each 0-255) into a bytes object"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
c38b1e77
PH
1996
1997
class LockingUnsupportedError(OSError):
    """OSError subclass carrying a fixed "not supported" message"""

    # Message passed to OSError by the argument-less constructor
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
2003
2004
c1c9a79c
PH
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using
# LockFileEx/UnlockFileEx on Windows and fcntl elsewhere. When neither is
# available both functions raise LockingUnsupportedError.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock f via LockFileEx; raises BlockingIOError on failure"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because unlocking needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock f with flock(), falling back to lockf()"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held elsewhere; do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock f with flock(), falling back to lockf()"""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
c1c9a79c
PH
2090
2091
class locked_file:
    """File wrapper that takes an OS-level lock on entry and releases it on exit

    The file is opened by the constructor; the lock is acquired by
    __enter__()/open() and released by __exit__()/close(), which also closes
    the file. Opening for reading takes a shared lock, every other mode an
    exclusive one. Unknown attributes are forwarded to the underlying file.

    @param mode      One of r, rb, a, ab, w, wb (else NotImplementedError)
    @param block     Whether acquiring the lock may block
    @param encoding  Passed to os.fdopen
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        fd = os.open(filename, flags, 0o666)
        try:
            self.f = os.fdopen(fd, mode, encoding=encoding)
        except BaseException:
            # Do not leak the descriptor when wrapping it fails (e.g. an
            # encoding given together with a binary mode). The descriptor may
            # already have been closed by the failed wrapper, hence suppress.
            with contextlib.suppress(OSError):
                os.close(fd)
            raise

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncation is deferred until the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2155
4eb7f1d1 2156
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none (cached)"""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
2161
2162
def shell_quote(args):
    """Quote every argument and join them with spaces into one command string"""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
9d4660ca
PH
2172
2173
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into the URL fragment. Data already smuggled in url
    is kept and takes precedence over data on conflicting keys. The data
    argument itself is not modified.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict instead of updating the caller's dict in place
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
9d4660ca
PH
2182
2183
def unsmuggle_url(smug_url, default=None):
    """Split smuggled data off a URL

    Returns (url, data); data is default when the URL carries no smuggled
    fragment.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    query = urllib.parse.parse_qs(fragment)
    return url, json.loads(query['__youtubedl_smuggle'][0])
02dbf93f
PH
2191
2192
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc

    @param fmt     %-format receiving (scaled number, suffix)
    @param factor  Ratio between successive suffixes; 1024 selects Ki, Mi, ...
    Returns None if num is not a number or is negative.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    # Clamp at 0 as well: for 0 < num < 1/factor the logarithm is <= -1 and a
    # negative exponent would index the suffix list from the end
    exponent = 0 if num == 0 else max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2205
2206
def format_bytes(bytes):
    """Format a byte count with binary suffixes (KiB, MiB, ...), or 'N/A'"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 2209
1c088fa8 2210
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s using a unit -> multiplier table

    The number may use "," or "." as the decimal separator. Returns the
    product as an int, or None when s does not start with such a value.
    """
    units_re = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if found is None:
        return None
    number = found.group('num').replace(',', '.')
    multiplier = unit_table[found.group('unit')]
    return int(float(number) * multiplier)
2220
2221
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size such as "1.5 MiB" into bytes

    Returns None if s is None or does not start with a number followed by a
    known unit.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        _UNIT_TABLE.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
2291
2292
def parse_count(s):
    """Parse a count such as "1,234" or "1.2M views" into an int

    Returns None if s is None or no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric word
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {}
    for unit, multiplier in (
            ('k', 1000), ('m', 1000 ** 2), ('kk', 1000 ** 2), ('b', 1000 ** 3)):
        # Each unit is accepted in lower and upper case
        _UNIT_TABLE[unit] = multiplier
        _UNIT_TABLE[unit.upper()] = multiplier

    scaled = lookup_unit_table(_UNIT_TABLE, s)
    if scaled is not None:
        return scaled

    # A leading number followed by whitespace or the end of the string
    leading = re.match(r'([\d,.]+)(?:$|\s)', s)
    if leading:
        return str_to_int(leading.group(1))
be64b5b0 2320
2f7ae819 2321
def parse_resolution(s, *, lenient=False):
    """Extract width/height information from a string

    Recognizes "WxH", "<height>p"/"<height>i" and "4k"/"8k". Returns a dict
    with 'width' and/or 'height', or {} if nothing matches.

    @param lenient  Allow "WxH" to be directly adjacent to letters or digits
    """
    if s is None:
        return {}

    dimensions = (
        re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s) if lenient
        else re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s))
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(kilo.group(1)) * 540}

    return {}
2345
2346
def parse_bitrate(s):
    """Return the integer in front of "kbps" in s, or None"""
    if not isinstance(s, str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2353
2354
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name

    @param lang  Key into MONTH_NAMES; unknown languages fall back to 'en'
    Returns None for an unknown name.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2364
2365
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None for an unknown abbreviation """
    # An abbreviation is the first three letters of the month name
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
18258362
JMF
2374
2375
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
2382
2383
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl()

    Silently does nothing when ctypes, libc.so.6 or prctl is unavailable.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except (OSError, TypeError):
        # OSError: the library cannot be loaded.
        # TypeError: kept from an old LoadLibrary bytes/str incompatibility.
        return

    # Initialising the buffer from the bytes (instead of sizing it to
    # len(title)) makes it one byte longer, so the name passed to prctl is
    # NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 = PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2409
2410
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged otherwise"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2413
2414
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged otherwise

    An empty end leaves s untouched.
    """
    # The explicit emptiness check matters: every string ends with '', and
    # s[:-0] would evaluate to '' rather than s
    if s is not None and s.endswith(end) and end:
        return s[:-len(end)]
    return s
2b9faf55
PH
2417
2418
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2426
2427
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location without a leading "www.", or None if empty.
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
b6e0c7d2
U
2434
2435
def url_basename(url):
    """Return the last path component of a URL, ignoring surrounding slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
2439
2440
02dc0a36
S
def base_url(url):
    """Return the http(s) URL up to and including the last "/" before any "?", "#" or "&"

    Raises AttributeError if url does not match.
    """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
2443
2444
def urljoin(base, path):
    """Join path onto base, tolerating bytes and invalid input

    Returns path unchanged if it is already absolute or protocol-relative.
    Returns None if path is empty or not a string, or if base is not an
    http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # The hyphen goes last in the class so that it is a literal rather than
    # the "+" to "." range, which would also admit "," in a scheme
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
e34c3361
S
2458
2459
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD"""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2463
2464
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT"""

    def get_method(self):
        return 'PUT'
2468
2469
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible

    @param scale     The result is floor-divided by this
    @param invscale  The result is multiplied by this before dividing
    @param get_attr  If set, convert this attribute of v instead of v itself
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        result = default
    return result
9732d77e 2477
9572013d 2478
def str_or_none(v, default=None):
    """Return str(v), or default when v is None"""
    if v is None:
        return default
    return str(v)
40a90862 2481
9732d77e
PH
2482
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Ints are returned as they are; commas, dots and plus signs are removed
    from strings before conversion.
    """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
608d11f5
PH
2490
2491
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when v is None or not convertible

    The result is multiplied by invscale and divided by scale.
    """
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except (ValueError, TypeError):
        result = default
    return result
43f775e4
PH
2499
2500
c7e327c4
S
def bool_or_none(v, default=None):
    """Return v if it is a bool, otherwise default"""
    if isinstance(v, bool):
        return v
    return default
2503
2504
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, otherwise default"""
    if isinstance(v, str):
        return v.strip()
    return default
b72b4431
S
2507
2508
def url_or_none(url):
    """Return the stripped URL if it is protocol-relative or uses a supported scheme, else None"""
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
af03000a
S
2514
2515
def request_to_url(req):
    """Return the full URL of a urllib Request; any other value is returned as is"""
    if not isinstance(req, urllib.request.Request):
        return req
    return req.get_full_url()
2521
2522
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with date_format, returning default on failure

    @param timestamp  UNIX timestamp (int/float, interpreted as UTC) or a
                      'YYYYMMDD' string
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object is still None here and the
        # resulting AttributeError is turned into default
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # OverflowError/OSError: timestamp outside the supported range
        return default
2533
2534
def parse_duration(s):
    """Parse a duration string into seconds (float)

    Accepts clock notation ("1:02:03.5"), unit notation including ISO 8601
    style ("PT1H3M", "3 min 5 s") and fractional hours/minutes ("2.5 hours").
    Returns None if s is not a string, is blank, or matches none of these.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    # [[[DD:]HH:]MM:]SS[.ms]
    found = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if found is None:
        # Numbers followed by unit names; years, months and weeks are
        # accepted by the pattern but not captured
        found = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)

    if found is not None:
        days, hours, mins, secs, ms = found.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # A single, possibly fractional, amount of hours or minutes
        found = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if found is None:
            return None
        hours, mins = found.group('hours', 'mins')

    if ms:
        # The clock notation also allows ":" before the fraction
        ms = ms.replace(':', '.')
    weighted = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(part or 0) * mult for part, mult in weighted)
91d7d0b3
JMF
2589
2590
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return f'{name}.{ext}{real_ext}'
    return f'{filename}.{ext}'
d70ad093
PH
2597
2598
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return f'{stem}.{ext}'
2604
2605
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when running the binary raises OSError. """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
2614
2615
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run exe with args and return its combined stdout/stderr text

    @param to_screen  Optional callable that receives a message describing
                      the command before it runs
    Returns False when running the command raises OSError.
    """
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    command = [encodeArgument(exe)] + args
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run(
            command, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return stdout
cae97f65
PH
2628
2629
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the textual `output` of a program.

    `version_re` is searched in `output` and its first group is returned;
    when it is None a generic "version X" pattern is used. If nothing
    matches, `unrecognized` is returned.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2639
2640
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or produced no output) """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2647
2648
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the bounds and the step may be floats. With a single
    argument, that argument is the stop value and counting starts at 0.
    A zero step yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    # Multiplying by the direction lets one comparison serve both directions
    while direction * current < direction * stop:
        yield current
        current += step
2657
2658
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable

    Items are pulled from the iterable only when they are needed and are kept
    in a cache. Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull everything that is left into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        items = self._exhaust()
        return items[::-1] if self._reversed else items[::1]

    @staticmethod
    def _reverse_index(x):
        # ~x maps position x counted from one end to the same position from the other
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Fetch just enough items to cover the highest requested position
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only the first item (in iteration order) is needed to decide
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares both the iterable and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2746
483336e7 2747
class PagedList:
    """Base class for lists whose items are fetched one page at a time.

    `pagefunc(pagenum)` must return an iterable with the items of that page.
    Subclasses must implement `_getslice`.
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache when possible"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            page_results = cached
        elif pagenum > self._pagecount:
            # Pages beyond the known page count are treated as empty
            page_results = []
        else:
            page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if not found:
            raise self.IndexError()
        return found[0]
55575225 2786
9c44d242
PH
2787
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted item inside this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0
            # Offset just past the last wanted item, if the slice ends in this page
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched before re-raising
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            #
            # Likewise, if we got the whole page but the next page is not
            # interesting, break out early as well
            if len(page_results) + startv < self._pagesize or end == nextfirstid:
                break
81c2f20b
PH
2827
2828
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
        else:
            end_page = min(self._pagecount, end // self._pagesize + 1)
        # Items to drop from the front of the first page
        skip_elems = start - start_page * self._pagesize
        # Items still to be yielded; None means "no limit"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) >= only_more:
                    # This page completes the requested slice
                    yield from page_results[:only_more]
                    break
                only_more -= len(page_results)
            yield from page_results
9c44d242
PH
2853
2854
class PlaylistEntries:
    """Indexable view over the 'entries' of a playlist info dict.

    Indexing uses 1-based playlist positions and yields
    (playlist_index, entry) pairs.
    """

    MissingEntry = object()
    is_exhausted = False

    class IndexError(IndexError):
        pass

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        if isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each given entry at its requested (1-based) position and
            # mark every other position as missing
            slots = [self.MissingEntry] * max(requested_entries)
            self._entries = slots
            for position, entry in zip(requested_entries, entries):
                slots[position - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for each comma-separated segment of `string`"""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            if has_range:
                yield slice(int_or_none(start), float_or_none(end), int_or_none(step))
            else:
                yield int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the ydl params"""
        params = self.ydl.params
        playlist_items = params.get('playlist_items')
        playlist_start = params.get('playliststart', 1)
        playlist_end = params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = idx.step if idx.step is not None else 1
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        elif idx.start >= 0:
            start = idx.start - 1
        else:
            start = len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        elif idx.stop >= 0:
            stop = idx.stop - 1
        else:
            stop = len(self) + idx.stop
        # The given stop is inclusive, so move the bound one position further
        stop += 1 if step > 0 else -1

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))
2987
2988
def uppercase_escape(s):
    r"""Decode every \UXXXXXXXX escape sequence found in `s`"""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        text, _consumed = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
0fe2ff78
YCH
2995
2996
def lowercase_escape(s):
    r"""Decode every \uXXXX escape sequence found in `s`"""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        text, _consumed = decode(match.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
b53466e1 3003
d05cfe06
S
3004
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters listed here are left untouched by quote()
    keep_unescaped = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=keep_unescaped)
d05cfe06
S
3008
3009
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host is IDNA-encoded; every other escaped component is percent-encoded
    netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped.geturl()
3020
62e609ab 3021
def parse_qs(url):
    """Return the query string of `url` parsed into a dict of value lists"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
4dfbf869 3024
3025
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from the file-like object `batch_fd`.

    Blank lines and lines starting with '#', ';' or ']' are skipped, and
    anything after a whitespace followed by '#' is dropped. `batch_fd` is
    closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        before_comment = re.split(r'\s#', url, maxsplit=1)[0]
        return before_comment.rstrip()

    with contextlib.closing(batch_fd) as fd:
        urls = []
        for line in fd:
            cleaned = fixup(line)
            if cleaned:
                urls.append(cleaned)
        return urls
b74fa8cd
JMF
3043
3044
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data (as urllib.parse.urlencode does) and return it as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
3047
3048
def update_url_query(url, query):
    """Return `url` with the parameters in `query` added to its query string.

    Parameters already present in the URL are replaced by those in `query`.
    An empty `query` returns `url` unchanged.
    """
    if not query:
        return url
    parsed_url = urllib.parse.urlparse(url)
    merged = urllib.parse.parse_qs(parsed_url.query)
    merged.update(query)
    new_query = urllib.parse.urlencode(merged, doseq=True)
    return urllib.parse.urlunparse(parsed_url._replace(query=new_query))
16392824 3057
8e60dc75 3058
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Build a new request from `req` with the given changes applied.

    `headers` are merged over those of `req`; `url` and `data` replace the
    originals when given; `query` is merged into the URL's query string.
    The request class is chosen according to the method of `req`.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = urllib.request.Request

    new_req = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3077
3078
def _multipart_encode_impl(data, boundary):
    """Encode the dict `data` as multipart/form-data using `boundary`.

    Returns a tuple (body_bytes, content_type).
    Raises ValueError if the boundary occurs inside any of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')

    chunks = []
    for name, value in data.items():
        chunks.append(b'--' + boundary_bytes + b'\r\n')
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        field = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in field:
            raise ValueError('Boundary overlaps with data')
        chunks.append(field)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
3099
3100
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded_bytes, content_type). A ValueError is raised if
    a specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary is used as-is; a clash with the data is an error
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashed with the data; try another one
            continue
3129
3130
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value of `d` among the given key(s).

    None values are always skipped; other falsy values are skipped unless
    `skip_false_values` is False. `default` is returned if nothing qualifies.
    """
    lookup = d.get
    for key in variadic(key_or_keys):
        val = lookup(key)
        if val is None:
            continue
        if val or not skip_false_values:
            return val
    return default
cbecc9b9
S
3136
3137
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of `funcs` in turn and return the first acceptable result.

    A function that raises AttributeError, KeyError, TypeError, IndexError or
    ZeroDivisionError is skipped, as is a result that is not an instance of
    `expected_type` (when given). Returns None if no function qualifies.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
3148
def try_get(src, getter, expected_type=None):
    """Apply the getter (or each of several getters) to `src` through try_call
    and return the first acceptable result"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
329ca3be
S
3151
3152
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which cndn(key, value) is true.

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3155
3156
6cc62232
S
def merge_dicts(*dicts):
    """Merge `dicts` into a new dict, with earlier dicts taking precedence.

    None values are never stored. An already stored empty string can be
    replaced by a string from a later dict.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3165
3166
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` unchanged if it is a str; otherwise decode it using `encoding` and `errors`"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
8e60dc75 3169
16392824 3170
a1a530b0
PH
# Age value for each US rating label; parse_age_limit() looks up the
# upper-cased input here and returns the mapped number
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
3178
3179
# Age value for each "TV-*" rating label; parse_age_limit() matches the part
# after "TV-" (also accepting "TV_" or no separator) and returns the mapped number
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3188
3189
def parse_age_limit(s):
    """Convert an age rating into a numeric age limit.

    Accepts an int between 0 and 21, a string such as "18" or "18+", a key of
    US_RATINGS, or a "TV-*" label from TV_PARENTAL_GUIDELINES (letter case is
    ignored for the latter two). Returns None for anything else.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    label = s.upper()
    if label in US_RATINGS:
        return US_RATINGS[label]

    # k[3:] strips the leading "TV-" of each key
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_match = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, label)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
146c80e2
S
3206
3207
def strip_jsonp(code):
    """Strip a JSONP wrapper such as `callback(...);` from `code` and return
    what is inside the parentheses. Text that does not match is returned unchanged."""
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
478c2c61
PH
3216
3217
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/array literal into JSON text.

    code:   the JavaScript source to convert
    vars:   a dict of var, val pairs to substitute; the value is inserted
            verbatim in place of the identifier
    strict: if true, an identifier that is not in `vars` raises ValueError
            instead of being turned into a string, and `new Date("...")`
            is not rewritten
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )
    # Rewrites applied to escape sequences and bare double quotes inside strings
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token in ('undefined', 'void 0'):
            return 'null'
        # Comments, "!" operators and trailing commas are dropped
        if token.startswith(('/*', '//', '!')) or token == ',':
            return ""

        if token[0] in ("'", '"'):
            # String literal: re-quote with double quotes
            inner = re.sub(
                r'(?s)\\.|"',
                lambda esc: STRING_ESCAPES.get(esc.group(0), esc.group(0)),
                token[1:-1])
            return '"%s"' % inner

        # Hexadecimal and octal integers (possibly used as keys) become decimal
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                number = int(im.group(1), base)
                return '"%d":' % number if token.endswith(':') else '%d' % number

        if token in vars:
            return vars[token]
        if strict:
            raise ValueError(f'Unknown value: {token}')

        return '"%s"' % token

    def create_map(mobj):
        # `new Map([[k, v], ...])` is turned into a JSON object
        pairs = json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))
        return json.dumps(dict(pairs))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3273
3274
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in `quality_ids`,
    or to -1 if the id is not in the list. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
3283
acd69589 3284
# Recognized stage names
# NOTE(review): presumably the points at which a post-processor can be run - confirm against the callers
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types, each mapped to either None or a string
# NOTE(review): the strings look like filename suffixes for that type of file - confirm against the callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
0a871f68 3305
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template for a verbose regex and must be filled in with str.format():
#   {0} - pattern for the mapping key
#   {1} - pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3324
7d1eb38a 3325
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than `length` are cut so that the result, including the
    trailing '...', is `length` characters long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
3334
3335
def version_tuple(v):
    """Split the version string `v` at '.' and '-' and return its parts as a tuple of ints"""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
3338
3339
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    If `version` is empty, or either version cannot be parsed, the answer is
    `not assume_new`.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
732ea2f0
PH
3347
3348
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at the top of the module
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
7d4111ed
PH
3355
3356
def args_to_str(args):
    """Get a short string representation for a subprocess command:
    each argument is shell-quoted and the results are joined with spaces"""
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2ccd1b10
PH
3360
3361
def error_to_compat_str(err):
    """Return the message of the exception `err` as a str"""
    message = str(err)
    return message
fdae2358
S
3364
3365
def error_to_str(err):
    """Format the exception `err` as "<class name>: <message>" """
    kind = type(err).__name__
    return f'{kind}: {err}'
3368
3369
def mimetype2ext(mt):
    """Guess a file extension from the MIME type `mt`.

    Parameters after ';' are ignored and matching is case-insensitive.
    Lookup order: full type, subtype, structured-syntax suffix (the part
    after '+'). If nothing matches, the subtype itself is returned with
    '+' replaced by '.'. Returns None if `mt` is None.
    """
    if mt is None:
        return None

    # Type and subtype names are case-insensitive, so normalize once up front
    # instead of only for some of the lookups below
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
c460bdd5
PH
3432
3433
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a filename/URL.

    Returns None for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # A value without a dot is taken to be a bare extension
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(name)
    return mime_type
3440
3441
def parse_codecs(codecs_str):
    """Parse a comma-separated codecs string into its individual codecs.

    Returns a dict with 'vcodec', 'acodec' and 'dynamic_range' (plus 'scodec'
    when a subtitle codec is found). If no codec is recognized but exactly
    two are given, they are taken to be the video and audio codec, in that
    order. Returns {} otherwise.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    stripped = (item.strip() for item in codecs_str.strip().strip(',').split(','))
    split_codecs = [item for item in stripped if item]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros in front of a digit are dropped before matching (e.g. av01 -> av1)
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            # Only the first video codec is used
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif (family == 'av1' and traverse_obj(parts, 3) == '10') or parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3482
3483
fc61aff4
LL
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension that can hold the given streams.

    vcodecs/vexts and acodecs/aexts are parallel lists describing the video
    and audio streams. `preferences` is an optional sequence of extensions
    to choose from, in order of preference.
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # More than one stream of a kind: go straight to mkv when it is allowed
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
            'h264', 'aacl',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Keep only the part before the first '.', with every '0' removed
    sanitize_codec = functools.partial(try_get, getter=lambda x: x.split('.')[0].replace('0', ''))
    vcodec, acodec = sanitize_codec(vcodecs[0]), sanitize_codec(acodecs[0])

    # First choice: a container known to support both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        if ext == 'mkv':
            return ext
        supported = COMPATIBLE_CODECS.get(ext, set())
        if supported.issuperset((vcodec, acodec)):
            return ext

    # Second choice: a container compatible with all the source extensions
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3522
3523
def urlhandle_detect_ext(url_handle):
    """Determine a file extension from the headers of `url_handle`.

    The filename of a Content-Disposition attachment is preferred; otherwise
    the extension is derived from the Content-Type.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3536
3537
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 `data:` URI holding the bytes `data` with the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3540
3541
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    content_limit: the age limit of the content (None: available for everyone)
    age_limit:     the limit to check against (None: no limit set) """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3550
3551
# List of known byte-order-marks (BOM)
# Each entry is (BOM bytes, name of the encoding it indicates).
# The 4-byte utf-32 marks come before the 2-byte utf-16 marks:
# b'\xff\xfe\x00\x00' also starts with the utf-16-le mark b'\xff\xfe'
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
a904a7f8
L
3560
3561
61ca9a80
PH
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the decoded text starts with optional
    whitespace followed by '<', else None. """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip the BOM, even if repeated, and decode using its encoding
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
3571
3572
def determine_protocol(info_dict):
    """Return the protocol to use for a format dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is guessed from
    the start of the sanitized URL, then from the URL's extension, and finally
    taken from the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3593
3594
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row   Cells of the header line
    @param data         List of rows (each a list of cells)
    @param delim        If truthy, a string repeated to draw a rule under the header
    @param extra_gap    Additional spaces between columns (one is always added)
    @param hide_empty   Drop the columns whose data cells all have zero width
    @returns            The table as a single newline-joined string
    """
    def visible_len(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    # The data column widths double as keep-flags: a width of 0 is falsy
    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    widths = column_widths([header_row, *data])
    gap = extra_gap + 1
    rows = [header_row, *data]
    if delim:
        rule = [delim * (col_width + gap) for col_width in widths]
        rule[-1] = rule[-1][:-gap * len(delim)]  # Remove extra_gap from end of delimiter
        rows.insert(1, rule)

    def pad(text, col_width):
        slack = col_width - visible_len(text)
        if '\t' in text:
            # The tab expands to the slack, pushing the text after it to the right edge
            return text.replace('\t', ' ' * slack) + ' ' * gap
        return text + ' ' * (slack + gap)

    return '\n'.join(
        ''.join(pad(str(cell), widths[pos]) for pos, cell in enumerate(row)).rstrip()
        for row in rows)
347de493
PH
3625
3626
8f18aca8 3627def _match_one(filter_part, dct, incomplete):
77b87f05 3628 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3629 STRING_OPERATORS = {
3630 '*=': operator.contains,
3631 '^=': lambda attr, value: attr.startswith(value),
3632 '$=': lambda attr, value: attr.endswith(value),
3633 '~=': lambda attr, value: re.search(value, attr),
3634 }
347de493 3635 COMPARISON_OPERATORS = {
a047eeb6 3636 **STRING_OPERATORS,
3637 '<=': operator.le, # "<=" must be defined above "<"
347de493 3638 '<': operator.lt,
347de493 3639 '>=': operator.ge,
a047eeb6 3640 '>': operator.gt,
347de493 3641 '=': operator.eq,
347de493 3642 }
a047eeb6 3643
6db9c4d5 3644 if isinstance(incomplete, bool):
3645 is_incomplete = lambda _: incomplete
3646 else:
3647 is_incomplete = lambda k: k in incomplete
3648
64fa820c 3649 operator_rex = re.compile(r'''(?x)
347de493 3650 (?P<key>[a-z_]+)
77b87f05 3651 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3652 (?:
a047eeb6 3653 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3654 (?P<strval>.+?)
347de493 3655 )
347de493 3656 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3657 m = operator_rex.fullmatch(filter_part.strip())
347de493 3658 if m:
18f96d12 3659 m = m.groupdict()
3660 unnegated_op = COMPARISON_OPERATORS[m['op']]
3661 if m['negation']:
77b87f05
MT
3662 op = lambda attr, value: not unnegated_op(attr, value)
3663 else:
3664 op = unnegated_op
18f96d12 3665 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3666 if m['quote']:
3667 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3668 actual_value = dct.get(m['key'])
3669 numeric_comparison = None
f9934b96 3670 if isinstance(actual_value, (int, float)):
e5a088dc
S
3671 # If the original field is a string and matching comparisonvalue is
3672 # a number we should respect the origin of the original field
3673 # and process comparison value as a string (see
18f96d12 3674 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3675 try:
18f96d12 3676 numeric_comparison = int(comparison_value)
347de493 3677 except ValueError:
18f96d12 3678 numeric_comparison = parse_filesize(comparison_value)
3679 if numeric_comparison is None:
3680 numeric_comparison = parse_filesize(f'{comparison_value}B')
3681 if numeric_comparison is None:
3682 numeric_comparison = parse_duration(comparison_value)
3683 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3684 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3685 if actual_value is None:
6db9c4d5 3686 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3687 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3688
3689 UNARY_OPERATORS = {
1cc47c66
S
3690 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3691 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3692 }
64fa820c 3693 operator_rex = re.compile(r'''(?x)
347de493 3694 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3695 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3696 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3697 if m:
3698 op = UNARY_OPERATORS[m.group('op')]
3699 actual_value = dct.get(m.group('key'))
6db9c4d5 3700 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3701 return True
347de493
PH
3702 return op(actual_value)
3703
3704 raise ValueError('Invalid filter part %r' % filter_part)
3705
3706
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    Conditions are separated by "&"; a literal ampersand is written as "\\&".
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # Split on every "&" that is not preceded by a backslash
    parts = re.split(r'(?<!\\)&', filter_str)
    return all(_match_one(part.replace(r'\&', '&'), dct, incomplete) for part in parts)
347de493
PH
3717
3718
def match_filter_func(filters):
    """Build a match-filter callable from one or more filter strings.

    Returns None when no filters are given. The special filter "-" enables
    interactive mode: a passing (and complete) entry then yields NO_DEFAULT
    instead of None. The returned callable gives None/NO_DEFAULT when the
    entry passes any filter, or a message string explaining the skip.
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = False
    if '-' in filters:
        interactive = True
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        passed = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if not passed:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
91410c9b
PH
3736
3737
class download_range_func:
    """Callable that yields the sections of a video to download.

    @param chapters  Iterable of regexes matched against chapter titles (or None)
    @param ranges    Iterable of (start, end) pairs (or None)
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        """Yield every matching chapter (with its 'index'), then every explicit time range."""
        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for pattern in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(pattern, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        # Only report when chapter regexes were requested and none of them matched
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges
5ec1b6b7 3758
3759
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into a number of seconds.

    Accepts a plain number with an optional "s" suffix, or a clock value
    "H:MM:SS" whose seconds may carry a fraction separated by "." or ":".
    Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
bf6427d2
YCH
3771
3772
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, minutes, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msec)
3775
3776
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc, centisecond precision)."""
    parts = timetuple_from_msec(seconds * 1000)
    centiseconds = parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*parts[:-1], centiseconds)
bf6427d2
YCH
3780
3781
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, rendering the supported text styling
    as <font>/<b>/<i>/<u> markup.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError If the document contains no <p> elements
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> resolved (inherited + own) supported properties
    default_style = {}  # properties taken from the styles of <body>/<div>

    class TTMLPElementParser:
        """XMLParser target that renders one <p> element as SRT text."""
        _out = ''
        # NOTE: these two lists are class attributes and are mutated in place, so
        # they are shared by all parser instances created during one conversion.
        # An applied style that is not popped in end() therefore stays visible
        # to the paragraphs parsed afterwards; this is kept as-is on purpose.
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest to highest: default style, referenced style, inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that are already in effect with the same value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the element and feed it through the styling-aware target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Map the legacy namespace URIs to the current ones so a single set of XPaths suffices
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent that appears later
    # in the document, so passes are repeated until every style is resolved.
    # A pass that resolves nothing new means the remaining parents are missing or
    # cyclic; those styles are left out instead of looping forever.
    style_nodes = dfxp.findall(_x('.//ttml:style'))
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in style_nodes:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            else:
                # Register the style even if it carries no supported property,
                # so that styles inheriting from it can still be resolved
                styles.setdefault(style_id, {})
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles[style_id][prop] = prop_val
        if not unresolved or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The cue number follows the paragraph's position, also when earlier paragraphs are skipped
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Without a usable end time, fall back to begin + duration
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3944
3945
def cli_option(params, command_option, param, separator=None):
    """Build the command-line arguments for an option that takes a value.

    Returns [] when params[param] is missing or None, [option, value] when no
    separator is given, and a single "option<separator>value" item otherwise.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3951
3952
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line arguments for a boolean option.

    params[param] must be True, False or None; True/False are rendered as
    true_value/false_value, and None (or a missing key) yields no arguments.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
66e289ba
S
3957
3958
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
3961
3962
def cli_configuration_args(argdict, keys, default=None, use_compat=True):
    """Select the user-supplied argument list for the first matching key.

    @param argdict     Dict mapping lower-case keys to argument lists. A plain
                       list/tuple is accepted for backward compatibility and is
                       returned as-is when use_compat is set (ignored otherwise)
    @param keys        List/tuple of candidates in order of priority. A candidate
                       may itself hold several keys, whose lists are concatenated
    @param default     Returned when nothing matches; a new empty list if omitted
    @param use_compat  Whether a list/tuple argdict should be honoured
    @returns           The selected list of arguments
    """
    # A fresh list per call: a shared default list would be handed out to every
    # caller, and mutating it would leak into all later calls
    if default is None:
        default = []

    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_list = [args for args in candidates if args is not None]
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default
66e289ba 3981
6251555f 3982
def _configuration_args(main_key, argdict, exe, keys=None, default=None, use_compat=True):
    """Look up the configured arguments for executable ``exe`` used by ``main_key``.

    The lookup keys are built from the root key (``exe`` itself, or
    ``main_key+exe`` when the two differ) with each suffix in ``keys`` appended.
    When the bare root key is among them, the (main_key, exe) pair and 'default'
    are added as fallbacks; otherwise the backward-compatible handling of a
    list-type argdict is switched off.

    @param default  Returned when nothing matches; a new empty list if omitted
    """
    # Do not pass a shared list object down: it would be returned to the caller
    if default is None:
        default = []
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
3994
66e289ba 3995
class ISO639Utils:
    """Conversion between two-letter (ISO 639-1) and three-letter (ISO 639-2/T) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up; None if unknown.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns the first two-letter code in table order that maps to code,
        or None if there is none.
        """
        matches = (short_name for short_name, long_name in cls._lang_map.items() if long_name == code)
        return next(matches, None)
4200
class ISO3166Utils:
    """Lookup of country names by their two-letter (ISO 3166-1 alpha-2) code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4462
4463
86e5f3ed 4464class GeoUtils:
773f291d
S
4465 # Major IPv4 address blocks per country
4466 _country_ip_map = {
53896ca5 4467 'AD': '46.172.224.0/19',
773f291d
S
4468 'AE': '94.200.0.0/13',
4469 'AF': '149.54.0.0/17',
4470 'AG': '209.59.64.0/18',
4471 'AI': '204.14.248.0/21',
4472 'AL': '46.99.0.0/16',
4473 'AM': '46.70.0.0/15',
4474 'AO': '105.168.0.0/13',
53896ca5
S
4475 'AP': '182.50.184.0/21',
4476 'AQ': '23.154.160.0/24',
773f291d
S
4477 'AR': '181.0.0.0/12',
4478 'AS': '202.70.112.0/20',
53896ca5 4479 'AT': '77.116.0.0/14',
773f291d
S
4480 'AU': '1.128.0.0/11',
4481 'AW': '181.41.0.0/18',
53896ca5
S
4482 'AX': '185.217.4.0/22',
4483 'AZ': '5.197.0.0/16',
773f291d
S
4484 'BA': '31.176.128.0/17',
4485 'BB': '65.48.128.0/17',
4486 'BD': '114.130.0.0/16',
4487 'BE': '57.0.0.0/8',
53896ca5 4488 'BF': '102.178.0.0/15',
773f291d
S
4489 'BG': '95.42.0.0/15',
4490 'BH': '37.131.0.0/17',
4491 'BI': '154.117.192.0/18',
4492 'BJ': '137.255.0.0/16',
53896ca5 4493 'BL': '185.212.72.0/23',
773f291d
S
4494 'BM': '196.12.64.0/18',
4495 'BN': '156.31.0.0/16',
4496 'BO': '161.56.0.0/16',
4497 'BQ': '161.0.80.0/20',
53896ca5 4498 'BR': '191.128.0.0/12',
773f291d
S
4499 'BS': '24.51.64.0/18',
4500 'BT': '119.2.96.0/19',
4501 'BW': '168.167.0.0/16',
4502 'BY': '178.120.0.0/13',
4503 'BZ': '179.42.192.0/18',
4504 'CA': '99.224.0.0/11',
4505 'CD': '41.243.0.0/16',
53896ca5
S
4506 'CF': '197.242.176.0/21',
4507 'CG': '160.113.0.0/16',
773f291d 4508 'CH': '85.0.0.0/13',
53896ca5 4509 'CI': '102.136.0.0/14',
773f291d
S
4510 'CK': '202.65.32.0/19',
4511 'CL': '152.172.0.0/14',
53896ca5 4512 'CM': '102.244.0.0/14',
773f291d
S
4513 'CN': '36.128.0.0/10',
4514 'CO': '181.240.0.0/12',
4515 'CR': '201.192.0.0/12',
4516 'CU': '152.206.0.0/15',
4517 'CV': '165.90.96.0/19',
4518 'CW': '190.88.128.0/17',
53896ca5 4519 'CY': '31.153.0.0/16',
773f291d
S
4520 'CZ': '88.100.0.0/14',
4521 'DE': '53.0.0.0/8',
4522 'DJ': '197.241.0.0/17',
4523 'DK': '87.48.0.0/12',
4524 'DM': '192.243.48.0/20',
4525 'DO': '152.166.0.0/15',
4526 'DZ': '41.96.0.0/12',
4527 'EC': '186.68.0.0/15',
4528 'EE': '90.190.0.0/15',
4529 'EG': '156.160.0.0/11',
4530 'ER': '196.200.96.0/20',
4531 'ES': '88.0.0.0/11',
4532 'ET': '196.188.0.0/14',
4533 'EU': '2.16.0.0/13',
4534 'FI': '91.152.0.0/13',
4535 'FJ': '144.120.0.0/16',
53896ca5 4536 'FK': '80.73.208.0/21',
773f291d
S
4537 'FM': '119.252.112.0/20',
4538 'FO': '88.85.32.0/19',
4539 'FR': '90.0.0.0/9',
4540 'GA': '41.158.0.0/15',
4541 'GB': '25.0.0.0/8',
4542 'GD': '74.122.88.0/21',
4543 'GE': '31.146.0.0/16',
4544 'GF': '161.22.64.0/18',
4545 'GG': '62.68.160.0/19',
53896ca5
S
4546 'GH': '154.160.0.0/12',
4547 'GI': '95.164.0.0/16',
773f291d
S
4548 'GL': '88.83.0.0/19',
4549 'GM': '160.182.0.0/15',
4550 'GN': '197.149.192.0/18',
4551 'GP': '104.250.0.0/19',
4552 'GQ': '105.235.224.0/20',
4553 'GR': '94.64.0.0/13',
4554 'GT': '168.234.0.0/16',
4555 'GU': '168.123.0.0/16',
4556 'GW': '197.214.80.0/20',
4557 'GY': '181.41.64.0/18',
4558 'HK': '113.252.0.0/14',
4559 'HN': '181.210.0.0/16',
4560 'HR': '93.136.0.0/13',
4561 'HT': '148.102.128.0/17',
4562 'HU': '84.0.0.0/14',
4563 'ID': '39.192.0.0/10',
4564 'IE': '87.32.0.0/12',
4565 'IL': '79.176.0.0/13',
4566 'IM': '5.62.80.0/20',
4567 'IN': '117.192.0.0/10',
4568 'IO': '203.83.48.0/21',
4569 'IQ': '37.236.0.0/14',
4570 'IR': '2.176.0.0/12',
4571 'IS': '82.221.0.0/16',
4572 'IT': '79.0.0.0/10',
4573 'JE': '87.244.64.0/18',
4574 'JM': '72.27.0.0/17',
4575 'JO': '176.29.0.0/16',
53896ca5 4576 'JP': '133.0.0.0/8',
773f291d
S
4577 'KE': '105.48.0.0/12',
4578 'KG': '158.181.128.0/17',
4579 'KH': '36.37.128.0/17',
4580 'KI': '103.25.140.0/22',
4581 'KM': '197.255.224.0/20',
53896ca5 4582 'KN': '198.167.192.0/19',
773f291d
S
4583 'KP': '175.45.176.0/22',
4584 'KR': '175.192.0.0/10',
4585 'KW': '37.36.0.0/14',
4586 'KY': '64.96.0.0/15',
4587 'KZ': '2.72.0.0/13',
4588 'LA': '115.84.64.0/18',
4589 'LB': '178.135.0.0/16',
53896ca5 4590 'LC': '24.92.144.0/20',
773f291d
S
4591 'LI': '82.117.0.0/19',
4592 'LK': '112.134.0.0/15',
53896ca5 4593 'LR': '102.183.0.0/16',
773f291d
S
4594 'LS': '129.232.0.0/17',
4595 'LT': '78.56.0.0/13',
4596 'LU': '188.42.0.0/16',
4597 'LV': '46.109.0.0/16',
4598 'LY': '41.252.0.0/14',
4599 'MA': '105.128.0.0/11',
4600 'MC': '88.209.64.0/18',
4601 'MD': '37.246.0.0/16',
4602 'ME': '178.175.0.0/17',
4603 'MF': '74.112.232.0/21',
4604 'MG': '154.126.0.0/17',
4605 'MH': '117.103.88.0/21',
4606 'MK': '77.28.0.0/15',
4607 'ML': '154.118.128.0/18',
4608 'MM': '37.111.0.0/17',
4609 'MN': '49.0.128.0/17',
4610 'MO': '60.246.0.0/16',
4611 'MP': '202.88.64.0/20',
4612 'MQ': '109.203.224.0/19',
4613 'MR': '41.188.64.0/18',
4614 'MS': '208.90.112.0/22',
4615 'MT': '46.11.0.0/16',
4616 'MU': '105.16.0.0/12',
4617 'MV': '27.114.128.0/18',
53896ca5 4618 'MW': '102.70.0.0/15',
773f291d
S
4619 'MX': '187.192.0.0/11',
4620 'MY': '175.136.0.0/13',
4621 'MZ': '197.218.0.0/15',
4622 'NA': '41.182.0.0/16',
4623 'NC': '101.101.0.0/18',
4624 'NE': '197.214.0.0/18',
4625 'NF': '203.17.240.0/22',
4626 'NG': '105.112.0.0/12',
4627 'NI': '186.76.0.0/15',
4628 'NL': '145.96.0.0/11',
4629 'NO': '84.208.0.0/13',
4630 'NP': '36.252.0.0/15',
4631 'NR': '203.98.224.0/19',
4632 'NU': '49.156.48.0/22',
4633 'NZ': '49.224.0.0/14',
4634 'OM': '5.36.0.0/15',
4635 'PA': '186.72.0.0/15',
4636 'PE': '186.160.0.0/14',
4637 'PF': '123.50.64.0/18',
4638 'PG': '124.240.192.0/19',
4639 'PH': '49.144.0.0/13',
4640 'PK': '39.32.0.0/11',
4641 'PL': '83.0.0.0/11',
4642 'PM': '70.36.0.0/20',
4643 'PR': '66.50.0.0/16',
4644 'PS': '188.161.0.0/16',
4645 'PT': '85.240.0.0/13',
4646 'PW': '202.124.224.0/20',
4647 'PY': '181.120.0.0/14',
4648 'QA': '37.210.0.0/15',
53896ca5 4649 'RE': '102.35.0.0/16',
773f291d 4650 'RO': '79.112.0.0/13',
53896ca5 4651 'RS': '93.86.0.0/15',
773f291d 4652 'RU': '5.136.0.0/13',
53896ca5 4653 'RW': '41.186.0.0/16',
773f291d
S
4654 'SA': '188.48.0.0/13',
4655 'SB': '202.1.160.0/19',
4656 'SC': '154.192.0.0/11',
53896ca5 4657 'SD': '102.120.0.0/13',
773f291d 4658 'SE': '78.64.0.0/12',
53896ca5 4659 'SG': '8.128.0.0/10',
773f291d
S
4660 'SI': '188.196.0.0/14',
4661 'SK': '78.98.0.0/15',
53896ca5 4662 'SL': '102.143.0.0/17',
773f291d
S
4663 'SM': '89.186.32.0/19',
4664 'SN': '41.82.0.0/15',
53896ca5 4665 'SO': '154.115.192.0/18',
773f291d
S
4666 'SR': '186.179.128.0/17',
4667 'SS': '105.235.208.0/21',
4668 'ST': '197.159.160.0/19',
4669 'SV': '168.243.0.0/16',
4670 'SX': '190.102.0.0/20',
4671 'SY': '5.0.0.0/16',
4672 'SZ': '41.84.224.0/19',
4673 'TC': '65.255.48.0/20',
4674 'TD': '154.68.128.0/19',
4675 'TG': '196.168.0.0/14',
4676 'TH': '171.96.0.0/13',
4677 'TJ': '85.9.128.0/18',
4678 'TK': '27.96.24.0/21',
4679 'TL': '180.189.160.0/20',
4680 'TM': '95.85.96.0/19',
4681 'TN': '197.0.0.0/11',
4682 'TO': '175.176.144.0/21',
4683 'TR': '78.160.0.0/11',
4684 'TT': '186.44.0.0/15',
4685 'TV': '202.2.96.0/19',
4686 'TW': '120.96.0.0/11',
4687 'TZ': '156.156.0.0/14',
53896ca5
S
4688 'UA': '37.52.0.0/14',
4689 'UG': '102.80.0.0/13',
4690 'US': '6.0.0.0/8',
773f291d 4691 'UY': '167.56.0.0/13',
53896ca5 4692 'UZ': '84.54.64.0/18',
773f291d 4693 'VA': '212.77.0.0/19',
53896ca5 4694 'VC': '207.191.240.0/21',
773f291d 4695 'VE': '186.88.0.0/13',
53896ca5 4696 'VG': '66.81.192.0/20',
773f291d
S
4697 'VI': '146.226.0.0/16',
4698 'VN': '14.160.0.0/11',
4699 'VU': '202.80.32.0/20',
4700 'WF': '117.20.32.0/21',
4701 'WS': '202.4.32.0/19',
4702 'YE': '134.35.0.0/16',
4703 'YT': '41.242.116.0/22',
4704 'ZA': '41.0.0.0/11',
53896ca5
S
4705 'ZM': '102.144.0.0/13',
4706 'ZW': '102.177.192.0/18',
773f291d
S
4707 }
4708
4709 @classmethod
5f95927a
S
4710 def random_ipv4(cls, code_or_block):
4711 if len(code_or_block) == 2:
4712 block = cls._country_ip_map.get(code_or_block.upper())
4713 if not block:
4714 return None
4715 else:
4716 block = code_or_block
773f291d 4717 addr, preflen = block.split('/')
ac668111 4718 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4719 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4720 return str(socket.inet_ntoa(
ac668111 4721 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4722
4723
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that lets an individual request pick its own proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request; the header is consumed (deleted) by proxy_open().
    """

    def __init__(self, proxies=None):
        # Set default handlers so that http/https requests always pass
        # through proxy_open(), with '__noproxy__' as the proxy placeholder
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        """Apply the per-request proxy override, then dispatch on the proxy scheme.

        Returns None when no proxy applies or when a SOCKS proxy was recorded
        in the 'Ytdl-socks-proxy' header; otherwise defers to the base class.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with socks
            return None
        return super().proxy_open(req, proxy, type)
5bc880b9
YCH
4747
4748
0a5445dd
YCH
4749# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4750# released into Public Domain
4751# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4752
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Zero (and any non-positive value) is rendered as a single zero byte.
    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    words = []
    while n > 0:
        # Peel off 32 bits at a time, least significant word first
        words.append(struct.pack('>I', n & 0xffffffff))
        n >>= 32
    # Drop leading zero bytes; an all-zero result collapses to one zero byte
    packed = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    if blocksize > 0:
        overhang = len(packed) % blocksize
        if overhang:
            packed = (blocksize - overhang) * b'\000' + packed
    return packed
4781
4782
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    overhang = len(s) % 4
    if overhang:
        # Left-pad with zero bytes so the data splits into whole 32-bit words
        s = b'\000' * (4 - overhang) + s
    total = 0
    for offset in range(0, len(s), 4):
        (word,) = struct.unpack_from('>I', s, offset)
        total = (total << 32) + word
    return total
4798
4799
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The data is byte-reversed before being read as a hexadecimal integer
    reversed_hex = binascii.hexlify(data[::-1])
    message = int(reversed_hex, 16)
    return format(pow(message, exponent, modulus), 'x')
81bdc8fd
YCH
4815
4816
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as [0, 2] + padding + [0] + data, where every
    padding byte is non-zero so that the single zero separator before the
    data stays unambiguous.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves room for fewer than 8 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be non-zero (range 1..255); a zero here would be
    # mistaken for the separator when the padding is stripped
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4830
4831
7b2c3f47 4832def _base_n_table(n, table):
4833 if not table and not n:
4834 raise ValueError('Either table or n must be specified')
612f2be5 4835 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4836
44f14eb4 4837 if n and n != len(table):
612f2be5 4838 raise ValueError(f'base {n} exceeds table length {len(table)}')
4839 return table
59f898b7 4840
5eb6bdce 4841
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        # Collect digits least-significant first, then reverse at the end
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4853
4854
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {}
    for value, digit in enumerate(_base_n_table(n, table)):
        digit_values[digit] = value
    # The base is the number of distinct digits in the table
    base = len(digit_values)
    number = 0
    for digit in string:
        number = number * base + digit_values[digit]
    return number
4862
4863
def decode_base(value, digits):
    """Deprecated alias of decode_base_n(value, table=digits).

    Writes a deprecation notice via write_string() on every call.
    """
    # The replacement lives in yt_dlp.utils, so name the full module path
    write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
    return decode_base_n(value, table=digits)
f52354a8
YCH
4868
4869
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into its unpacked form.

    Every word in the obfuscated code is the base-n encoding of an index
    into the '|'-separated symbol list; an empty symbol stands for the
    word itself.
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    radix = int(base)
    remaining = int(count)
    words = symbols.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def _substitute(mobj):
        return replacements[mobj.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
e154c651 4886
4887
1ced2221
S
def caesar(s, alphabet, shift):
    """Shift every character of `s` found in `alphabet` by `shift` positions.

    Characters outside the alphabet are kept unchanged; positions wrap
    around at the end of the alphabet. A shift of 0 returns `s` as is.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
4895
4896
def rot47(s):
    """Apply a caesar() shift of 47 over the 94-character alphabet '!' to '~'."""
    rot47_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rot47_alphabet, 47)
4899
4900
def parse_m3u8_attributes(attrib):
    """Parse a 'KEY=value,KEY="quoted value"' attribute list into a dict.

    Surrounding double quotes are stripped from quoted values. When a key
    occurs more than once, the last occurrence wins.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in re.findall(attribute_re, attrib)
    }
1143535d
YCH
4908
4909
def urshift(val, n):
    """Shift `val` right by `n` bits, treating negative values as unsigned 32-bit."""
    if val < 0:
        # Map the negative value onto its 32-bit two's complement first
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4912
4913
4914# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4915# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
def decode_png(png_data):
    """Decode PNG data into (width, height, pixels).

    `pixels` holds one list per row with `width * 3` unfiltered byte values.
    The code always reads three values per pixel, so it presumably expects
    8-bit RGB without interlacing; the IHDR fields describing this are not
    checked. Raises OSError when the signature/IHDR is missing or when no
    IDAT data is present.
    """
    # Reference: https://www.w3.org/TR/PNG/
    remaining = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    struct_formats = {1: '>B', 2: '>H', 4: '>I'}

    def unpack_integer(raw):
        return struct.unpack(struct_formats[len(raw)], raw)[0]

    # Chunk layout: 4-byte length, 4-byte type, data, 4-byte CRC (CRC is skipped)
    chunks = []
    while remaining:
        length = unpack_integer(remaining[:4])
        chunks.append({
            'type': remaining[4:8],
            'length': length,
            'data': remaining[8:8 + length],
        })
        remaining = remaining[8 + length + 4:]

    ihdr = chunks[0]['data']
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Each scanline is one filter-type byte followed by `stride` data bytes
        row_start = y * (1 + stride)
        filter_type = scanlines[row_start]

        row = []
        pixels.append(row)
        row_above = pixels[y - 1] if y > 0 else None

        for x in range(stride):
            value = scanlines[row_start + 1 + x]
            # Neighbours used by the filters: same channel of the previous
            # pixel (3 bytes back) and of the pixel directly above
            left = row[x - 3] if x > 2 else 0
            up = row_above[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                value = (value + left) & 0xff
            elif filter_type == 2:  # Up
                value = (value + up) & 0xff
            elif filter_type == 3:  # Average
                value = (value + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = row_above[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                value = (value + predictor) & 0xff

            row.append(value)

    return width, height, pixels
efa97bdc
YCH
5019
5020
def write_xattr(path, key, value):
    """Store `value` (bytes) as the extended attribute `key` of the file `path`.

    Tries, in order: an NTFS alternate data stream (Windows only), the
    xattr/pyxattr python modules, and the setfattr/xattr executables.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no usable backend exists.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    if getattr(xattr, '_yt_dlp__identifier', None) != 'pyxattr':
        setter = xattr.setxattr if xattr else None
    elif version_tuple(xattr.__version__) >= (0, 5, 0):
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        setter = xattr.set
    else:
        setter = None

    if setter:
        try:
            setter(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    value = value.decode()
    if exe == 'xattr':
        command = [exe, '-w', key, value, path]
    else:
        command = [exe, '-n', key, '-v', value, path]
    try:
        _, stderr, returncode = Popen.run(
            command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5070
5071
def random_birthday(year_field, month_field, day_field):
    """Return a dict with a random date between 1950-01-01 and 1995-12-31.

    The three arguments are the dict keys to use for the year, month and
    day; the values are the corresponding numbers as strings.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    components = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, components)))
732044af 5082
c76eb41b 5083
732044af 5084# Templates for internet shortcut files, which are plain text files.
# '[InternetShortcut]' style shortcut; expects a 'url' key when %-formatted
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property-list (plist) XML shortcut; expects a 'url' key when %-formatted
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# '[Desktop Entry]' style shortcut; expects 'filename' and 'url' keys when %-formatted
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type name to the matching template above
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5115
732044af 5116
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Raises ValueError for IPv6 hosts, which are not supported yet.
    """

    parts = urllib.parse.urlparse(iri)

    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` values below contain the characters that should not be percent-encoded.
    # Everything else but letters, digits and '_.-' will be percent-encoded with an underlying
    # UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
    userinfo_safe = r"!$%&'()*+,~"
    path_safe = r"!$%&'()*+,/:;=@|~"
    # Unsure about the `safe` argument for params, since this is a legacy way of handling parameters.
    params_safe = r"!$%&'()*+,/:;=@|~"
    # Not totally sure about the `safe` argument for the query, since the source does not
    # explicitly mention the query URI component.
    query_safe = r"!$%&'()*+,/:;=?@{|}~"
    fragment_safe = r"!#$%&'()*+,/:;=?@{|}~"

    netloc_pieces = []
    if parts.username:
        netloc_pieces.append(urllib.parse.quote(parts.username, safe=userinfo_safe))
        if parts.password is not None:
            netloc_pieces.append(':' + urllib.parse.quote(parts.password, safe=userinfo_safe))
        netloc_pieces.append('@')

    # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
    netloc_pieces.append(parts.hostname.encode('idna').decode())
    port = parts.port
    if port is not None and port != 80:
        netloc_pieces.append(':' + str(port))

    return urllib.parse.urlunparse((
        parts.scheme,
        ''.join(netloc_pieces),
        urllib.parse.quote_plus(parts.path, safe=path_safe),
        urllib.parse.quote_plus(parts.params, safe=params_safe),
        urllib.parse.quote_plus(parts.query, safe=query_safe),
        urllib.parse.quote_plus(parts.fragment, safe=fragment_safe),
    ))
5159
5160
def to_high_limit_path(path):
    """Return `path` in a form that works around MAX_PATH on Windows.

    On win32/cygwin the absolute path is prefixed with '\\\\?\\'; on other
    platforms the path is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 5167
c76eb41b 5168
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` via traverse_obj() and render it with `template`.

    Returns `default` when the value should be ignored: with `ignore` left
    at NO_DEFAULT that means any falsy value other than 0; otherwise it
    means any value contained in `ignore`. `func` is applied to the value
    before formatting.
    """
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val and val != 0
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
00dd0cd5 5174
5175
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from `url`."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
ffcb8191
THD
5191
5192
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version-4 UUID as a lowercase hyphenated string.

    In the template, 'x' is any hex digit, the literal '4' is the version
    and 'y' is the variant nibble, restricted to 8, 9, a or b.
    """
    def _random_digit(mobj):
        if mobj.group(0) == 'y':
            # RFC 4122 variant: the two most significant bits must be 10
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5198
5199
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` if it does not exist yet.

    @param path       File path whose directory part should exist
    @param to_screen  Optional callable that receives an error message
                      when the directory cannot be created
    @returns          True on success, False if creation failed with OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # Only report when a reporter was actually supplied; the default
        # to_screen=None must not be called
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 5210
5211
def get_executable_path():
    """Return the absolute directory of the executable path reported by the update module."""
    from .update import _get_variant_and_executable_path

    variant_and_path = _get_variant_and_executable_path()
    executable = os.path.abspath(variant_and_path[1])
    return os.path.dirname(executable)
f74980cb 5216
5217
def load_plugins(name, suffix, namespace):
    """Load plugin objects from 'ytdlp_plugins/<name>/__init__.py' next to the executable.

    Every attribute of the plugin module whose name ends with `suffix` and
    is not already in `namespace` is added to `namespace`. Returns a dict
    of the newly added objects. A missing plugin file is silently ignored.
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        init_file = os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')
        plugins_spec = importlib.util.spec_from_file_location(name, init_file)
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for attr_name in dir(plugins):
            if attr_name in namespace or not attr_name.endswith(suffix):
                continue
            classes[attr_name] = namespace[attr_name] = getattr(plugins, attr_name)
    return classes
06167fbb 5234
5235
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple

    The paths are tried in order; the first one that yields a usable
    (non-None, type-accepted) result wins, otherwise `default` is returned.

    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - None:     Do nothing
                              - string:   A dictionary key
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        # Lower-case all string keys of every path up front; dict keys are
        # lower-cased on lookup further below
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (set by the outer loop) records the deepest level of list
        # nesting produced by branching keys, so the result can be flattened
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # A None key, or having nothing left to descend into, ends the walk
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse every alternative key, then treat the collected
                # results like an Ellipsis branch
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                # The rest of the path is applied to each branched value
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # NOTE(review): a str is iterated below as (k, v) pairs, which
                    # looks like it cannot unpack single characters - confirm
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                # Keep only the values for which the filter function is truthy
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Exact lookup first; in case-insensitive mode fall back to
                # comparing lower-cased dict keys
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User-supplied keys are strings: 'N' becomes an index and
                    # 'a:b:c' a slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ':' selects everything, i.e. behaves like Ellipsis
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalize expected_type into a function that returns None for rejected values
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching produced nested lists: flatten all but one level,
                # dropping None entries along the way
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5332
5333
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj(); writes a deprecation notice on each call."""
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5338
5339
def get_first(obj, keys, **kwargs):
    """Apply `keys` to every value of `obj` via traverse_obj() and return the first result."""
    branching_path = (..., *variadic(keys))
    return traverse_obj(obj, branching_path, **kwargs, get_all=False)
5342
5343
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is an iterable, otherwise wrap it in a 1-tuple.

    Instances of `allowed_types` are treated as single values and wrapped
    even though they are iterable.
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, allowed_types):
        return (x,)
    return x
bd50a52b
THD
5346
5347
3e9b66d7
LNO
def time_seconds(**kwargs):
    """Return the current POSIX timestamp.

    The keyword arguments are passed to datetime.timedelta() to build the
    UTC offset of the timezone in which the current time is taken.
    """
    offset = datetime.timedelta(**kwargs)
    now = datetime.datetime.now(datetime.timezone(offset))
    return now.timestamp()
5351
5352
49fa4d9a
N
5353# create a JSON Web Signature (jws) with HS256 algorithm
5354# the resulting format is in JWS Compact Serialization
5355# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5356# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build an HS256-signed token of the form b'header.payload.signature'.

    @param payload_data  JSON-serializable payload
    @param key           Signing secret (str)
    @param headers       Extra header fields merged over 'alg' and 'typ'
    @returns             The token as bytes; each part is encoded with
                         base64.b64encode
    """
    header_data = dict(alg='HS256', typ='JWT')
    header_data.update(headers or {})

    def _b64_json(data):
        return base64.b64encode(json.dumps(data).encode())

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
819e0531 5370
5371
16b0d7e6 5372# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a 'header.payload.signature' token.

    The signature is NOT verified and the header is ignored.
    Raises ValueError if the token does not consist of exactly three
    '.'-separated parts.
    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWT segments are base64url with the trailing '=' padding stripped,
    # so restore it before decoding
    padded_payload = payload_b64 + '=' * (-len(payload_b64) % 4)
    payload_data = json.loads(base64.urlsafe_b64decode(padded_payload))
    return payload_data
5377
5378
# None when not running on Windows; on Windows it starts as False and is
# set to True by windows_enable_vt_mode()
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5380
5381
@functools.cache
def supports_terminal_sequences(stream):
    """Return whether terminal escape sequences should be written to `stream`.

    Requires WINDOWS_VT_MODE to be truthy on Windows, or a non-empty TERM
    environment variable elsewhere, and `stream.isatty()` to succeed.
    The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Any failure to query the stream counts as "not a terminal"
        return False
5393
5394
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Mark VT mode as enabled on Windows 10.0.10586 or newer.

    Runs an empty shell command first; if that fails, nothing changes.
    On success WINDOWS_VT_MODE becomes True and the cache of
    supports_terminal_sequences() is cleared.
    """
    global WINDOWS_VT_MODE
    if get_windows_version() < (10, 0, 10586):
        return
    try:
        Popen.run('', shell=True)
    except Exception:
        return
    else:
        WINDOWS_VT_MODE = True
        supports_terminal_sequences.cache_clear()
5406
5407
# ESC '[' followed by one or more non-'m' characters and a closing 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of `_terminal_sequences_re` removed."""
    stripped = _terminal_sequences_re.sub('', string)
    return stripped
5413
5414
def number_of_digits(number):
    """Return the length of `number` formatted with '%d' (a minus sign counts)."""
    rendered = '%d' % number
    return len(rendered)
34921b43 5417
5418
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the str() of all truthy `values` with `delim`.

    When `from_dict` is given, each value is first used as a traverse_obj()
    path into it and the looked-up results are joined instead.
    """
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
06e57990 5423
5424
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand

    The thumbnails are returned unchanged when no format has a width.
    """
    dimension_keys = ('width', 'height')
    # Tuples compare element-wise, so the maximum is decided by width first
    candidates = [tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats]
    largest = max(candidates, default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        rewritten_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': rewritten_url},
            dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
5445
5446
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    Returns (start, end, total); the end and total are None when absent,
    and all three are None when the value is empty or does not match.
    """
    crg = range and re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
5455
5456
def read_stdin(what):
    """Announce that `what` is being read from STDIN and return `sys.stdin`.

    The notice names the platform's EOF key combination.
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5461
5462
a904a7f8
L
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)

    The encoding is None when neither a BOM nor a "# coding: ..."
    declaration at the very start of the data is found.
    """

    # BOM marks are given priority over declarations
    bom_match = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if not declaration:
        return None, 0
    return declaration.group(1).decode(), 0
5479
5480
class Config:
    """
    A list of command-line arguments together with the configs it pulls in

    Each instance keeps its own arguments in "own_args" and holds one child
    Config for every entry of the parsed "config_locations" option.
    "all_args" flattens that tree back into a single argument stream.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    # Stands for "no default given" in read_file, so that such calls never share a list
    _NO_DEFAULT = object()

    def __init__(self, parser, label=None):
        """
        @param parser   Parser object providing parse_known_args, parse_args and error
        @param label    Optional name used when the config is printed
        """
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """
        Set the arguments (and the file they came from) and load nested configs

        May only be called once per instance.
        @returns    Result of load_configs()
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse own_args and append a child config for every config location

        @returns    False if this config's file was already loaded, else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            # _loaded_paths is shared with the children (see append_config),
            # so a file that includes itself is only read once
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of this config's file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Own arguments (login info hidden) followed by the nested configs, each line prefixed with "| " """
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=_NO_DEFAULT):
        """
        Read a config file and split its contents into arguments

        @param filename     Path of the file to read
        @param default      Value returned when the file cannot be opened.
                            A new empty list is returned if omitted
        @returns            List of arguments, or the default
        @raises ValueError  If the contents cannot be read, decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            # silently skip if file is not present
            return [] if default is Config._NO_DEFAULT else default

        # "with" closes the file on every path, including unexpected errors
        # raised while detecting the encoding
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """
        Return a copy of opts with the values of credential options replaced by "PRIVATE"

        Both the "--option=value" and the "--option value" forms are handled.
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # The value of a bare private option is the argument that follows it
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from (args, filename) and keep it if it loaded"""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Arguments of the nested configs (last appended first), then this config's own"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Pass all_args to the parser's parse_known_args"""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Pass all_args to the parser's parse_args"""
        return self.parser.parse_args(self.all_args)
da42679b
LNO
5585
5586
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL passed to websockets.connect
        @param headers  Passed to websockets.connect as extra_headers
        @param connect  Whether to open the connection immediately. If so, it
                        is also registered to be closed at interpreter exit
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
            atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        """Open the connection unless that was already done"""
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Run the connection's send(*args) to completion"""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Run the connection's recv(*args) to completion and return its result"""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        """Close the connection, cancel whatever is still pending and close the loop"""
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Pending tasks can only be cancelled while the loop is still usable,
            # so the loop must be closed last
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If any new library using asyncio needs to be run in non-async scopes, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """
        Run the coroutine "main" on "loop" until it finishes and return its result

        @raises ValueError  If main is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            # Not every supported Python version has shutdown_default_executor
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of "loop" and report the exceptions they raise"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() takes the loop from the tasks themselves. Its explicit "loop"
        # argument was removed in Python 3.10 and must not be passed
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5656
5657
def merge_headers(*dicts):
    """
    Combine several dicts of HTTP headers into one

    Header names are normalized with str.title(), so names differing only in
    case are treated as the same header. Later dicts override earlier ones.
    """
    merged = {}
    for headers in dicts:
        # Calling dict.items unbound keeps the requirement that every argument is a dict
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
28787f16 5661
5662
def cached_method(f):
    """
    Cache the results of a method on the instance it is called on

    Results are kept per instance and per method name. The key is built from
    the call's arguments after binding them to the signature and filling in
    defaults, so positional and keyword spellings of the same call share one
    entry. The instance itself is not part of the key and therefore need not
    be hashable; every other argument value must be.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        # The first bound value is self. It is left out because the cache
        # already lives on the instance
        key = tuple(bound_args.arguments.values())[1:]

        if not hasattr(self, '__cached_method__cache'):
            setattr(self, '__cached_method__cache', {})
        cache = getattr(self, '__cached_method__cache').setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
5680
5681
class classproperty:
    """Descriptor that calls the wrapped function with the owner class on attribute access"""

    def __init__(self, func):
        # Copies the metadata of func (name, docstring, ...) onto the descriptor
        functools.wraps(func)(self)
        self.func = func

    def __get__(self, _, cls):
        getter = self.func
        return getter(cls)
19a03940 5691
5692
class Namespace(types.SimpleNamespace):
    """SimpleNamespace whose attribute values can be iterated over directly"""

    def __iter__(self):
        # Iterating yields the attribute values, not their names
        return iter(vars(self).values())

    @property
    def items_(self):
        """View of the (name, value) pairs of all attributes"""
        return vars(self).items()
9b8ee23b 5702
5703
# File extensions grouped by the kind of media they belong to
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# "video" and "audio" are declared above without the extensions listed under
# common_*; those are appended here so that both tuples are complete
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5718
5719
class RetryManager:
    """Iterator that yields itself once per attempt until an attempt sets no error

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        """
        @param _retries         Number of retries allowed; a falsy value means 0
        @param _error_callback  Called as (error, attempt, retries, **kwargs)
                                after every attempt that set an error
        """
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # _error is NO_DEFAULT only when the last attempt ran without setting an error
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        """The error set during the current attempt, or None"""
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset before each attempt so that a clean run ends the loop
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e            The error of the failed attempt
        @param count        Number of the attempt that failed
        @param retries      Number of retries allowed
        @param sleep_func   Delay before the next attempt: either a value, or
                            a callable invoked as sleep_func(n=count - 1)
        @param info         Called with the message about sleeping
        @param warn         Called with the warning about the retry
        @param error        Called with the final message once retries are
                            used up. If not given, e is raised instead
        @param suffix       Optional text added after "Retrying"
        """
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # str(None) is the truthy string 'None', so a missing cause has to be
            # skipped explicitly for orig_msg to be used
            cause = '' if e.cause is None else str(e.cause)
            e = remove_end(cause or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5774
5775
def make_archive_id(ie, video_id):
    """
    Build the string "<lowercased extractor key> <video_id>"

    @param ie           Extractor key as a string, or an object whose ie_key() returns it
    @param video_id     ID placed after the key
    """
    if isinstance(ie, str):
        key = ie
    else:
        key = ie.ie_key()
    prefix = key.lower()
    return f'{prefix} {video_id}'
5779
5780
def truncate_string(s, left, right=0):
    """
    Shorten s to at most left + right characters, marking the cut with "..."

    @param s        String to shorten; None is returned as it is
    @param left     Characters kept at the start, including the 3 of "..."; must be > 3
    @param right    Characters kept from the end; must be >= 0
    @returns        s unchanged if it already fits, else the shortened string
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    # s[-0:] is the whole string rather than an empty one, so right == 0 needs its own case
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
5786
5787
9b8ee23b 5788# Deprecated
5789has_certifi = bool(certifi)
5790has_websockets = bool(websockets)